Amsterdam 4 october 2022
Recently I had to check if a given number is an IBAN bank account number in a large dataset. There are quite a few different examples for code how to do this. But they all required access to files and binaries. As I usually interface with data via SQL, this is how to do it in PostgreSQL, with the use of native datatypes, but as a SQL script.
The function performs four tests: It checks if the minimum length of the IBAN is 15, If the first two characters are in [A-Z] and the second two characters are [0-9]. Also it checks for illegal characters, only alphanumeric and spaces are allowed. Note we could expand this check with checks for patterns of specific countries as some code examples do, however in practice this already captures 99.9% of the entries that aren't IBAN numbers. The final and time-expensive check is the modulo 97 on the number converted from the string. As PostgreSQL provides the datatype "numeric" which can hold arbitrarily large integers the implementation of the string to number and modulo operation is straightforward: <string>::numeric % 97. Replacing the characters with numbers is slightly more tricky. (See https://en.wikipedia.org/wiki/International_Bank_Account_Number#Validating_the_IBAN for an explanation of the algorithm). Note that it is also possible to incorporate this check in a (sub)query, it does not have to be a function as it is all standard SQl.
CREATE or replace FUNCTION check_iban(iban text) returns boolean
LANGUAGE SQL
AS $$
SELECT length(iban) >= 15 and iban ~ '^[A-Z][A-Z][0-9][0-9]' and iban ~ '^[[:alnum:] ]+$'
and (SELECT string_agg(new_iban::text,'')::numeric%97 = 1
FROM (SELECT case when ascii(my_split) >= 48 and ascii(my_split) <= 57 then my_split::int
when ascii(my_split) >= 65 and ascii(my_split) <= 90 then ascii(my_split)-55
else NULL
end as new_iban
FROM (select unnest(
string_to_array(
(select substring (iban,5,100)||substring (iban,1,4)), NULL) )
as my_split ) t) b)
-- Usage: select check_iban('NL42.RABO.1234.5678.90'::text) -- false
$$;
Amsterdam 5 october 2022
how to generate a pseudo random password string of 12 characters:
select substring(encode(decode(md5(random()::text),'hex'),'base64') from 1 for 12)
Amsterdam 10 october 2022
I found this on the web, really useful to load a csv file and create a table from its header:
CREATE OR REPLACE FUNCTION load_csv_file(
target_table text,
csv_path text,
col_count integer)
RETURNS void AS
$BODY$
declare
iter integer; -- dummy integer to iterate columns with
col text; -- variable to keep the column name at each iteration
col_first text; -- first column name, e.g., top left corner on a csv file or spreadsheet
begin
set schema 'public';
create table temp_table ();
-- add just enough number of columns
for iter in 1..col_count
loop
execute format('alter table temp_table add column col_%s text;', iter);
end loop;
-- copy the data from csv file
execute format('copy temp_table from %L with delimiter '','' quote ''"'' csv ', csv_path);
iter := 1;
col_first := (select col_1 from temp_table limit 1);
-- update the column names based on the first row which has the column names
for col in execute format('select unnest(string_to_array(trim(temp_table::text, ''()''), '','')) from temp_table where col_1 = %L', col_first)
loop
execute format('alter table temp_table rename column col_%s to %s', iter, col);
iter := iter + 1;
end loop;
-- delete the columns row
execute format('delete from temp_table where %s = %L', col_first, col_first);
-- change the temp table name to the name given as parameter, if not blank
if length(target_table) > 0 then
execute format('alter table temp_table rename to %I', target_table);
end if;
end;
$BODY$
LANGUAGE plpgsql VOLATILE
COST 100;
ALTER FUNCTION load_csv_file(text, text, integer)
OWNER TO postgres;
You can use:
alter table myTable alter column myColumn type numeric using (trim(myColumn)::numeric);
to later change the column types to something suitable.
Amsterdam 11 october 2022
Yes finally a generic load data from file:
-- FUNCTION: public.load_csv_file(text, text, integer)
-- DROP FUNCTION IF EXISTS public.load_csv_file(text, text,char, text);
CREATE OR REPLACE FUNCTION public.load_csv_file(
csv_path text,
target_table text,
non_delimiter char, -- this is a character that should not be in the file, as we read a whole line.
the_delimiter text -- this is the real delimiter, can be one or more characters
--perhaps add the NULL and escape character
)
RETURNS void
LANGUAGE 'plpgsql'
COST 100
VOLATILE PARALLEL UNSAFE
AS $BODY$
declare
--Usage: select load_csv_file('C:\Temp\testdata.dta','italia.testtable','@','|^|');
iter integer; -- dummy integer to iterate columns with
--col text; -- variable to keep the column name at each iteration
--col_first text; -- first column name, e.g., top left corner on a csv file or spreadsheet
data_array text[];
col_count integer;
select_string text;
begin
create temp table temp_table (the_data text, rec_id bigserial);
-- copy the data from csv file
execute format('copy temp_table(the_data) from %L with delimiter '''||non_delimiter||''' quote ''"'' csv ', csv_path);
-- get the column names from the header row and the number of columns.
data_array := (select string_to_array(temp_table.the_data::text, the_delimiter) from temp_table where rec_id = 1);
col_count := (select array_length(data_array,1));
-- create the actual table
execute format('create table %s();;', target_table);
-- add the columns
for iter in 1..col_count
loop
execute format('alter table %s add column %s text;',target_table, data_array[iter]);
end loop;
-- add the array fields to selection
select_string := 'the_array[1]';
for iter in 2..col_count
loop
select_string := select_string || ', the_array['||iter||']';
end loop;
-- select the array elements from all strings from the temp_table and insert as records
execute format('insert into %s (select %s
from (select string_to_array(temp_table.the_data::text, ''%s'') the_array from temp_table where rec_id > 1) t)
;',target_table,select_string,the_delimiter);
drop table temp_table;
end;
$BODY$;
Amsterdam 12 October 2022
A function to get total,filled,empty per column counts of a table. When a table name is given, it returns the stat for that table. When only a schema is given, it returns the stats for all the tables in the schema...
--Usage: select * from get_tablecounts('public','')
--Usage: select * from get_tablecounts('public','agreement')
CREATE OR REPLACE FUNCTION public.get_tablecounts( text,text)
RETURNS TABLE(tab_name text, col_name text, total bigint, filled bigint, empty bigint)
LANGUAGE 'plpgsql'
COST 100
VOLATILE PARALLEL UNSAFE
ROWS 1000
AS $BODY$
DECLARE
p_schema TEXT := $1;
p_tabname TEXT := $2;
v_sql_statement TEXT;
--Usage: select * from get_tablecounts('public','agreement')
--Usage: select * from get_tablecounts('public','')
BEGIN
SELECT STRING_AGG( '(SELECT '''
|| table_name
|| ''' as tn,'''
||column_name
|| ''' as cn,'
-- || ''','
-- || p_setcolumn
-- || ' as set ,'
|| '(select count('|| column_name || ') FROM '|| table_name || ') as total,'
|| '(select sum(case when ' || column_name || '::text != '''' then 1 else 0 end) FROM '|| table_name || ' ) as filled,'
|| '(select sum(case when ' || column_name || '::text isnull then 1 when ' || column_name || '::text = '''' then 1 else 0 end) FROM '|| table_name || ') as empty)'
,' UNION ALL ' ) INTO v_sql_statement
FROM information_schema.columns
WHERE table_schema = p_schema
AND case when p_tabname = '' then true else table_name = p_tabname end;
IF v_sql_statement IS NOT NULL THEN
RETURN QUERY EXECUTE v_sql_statement;
END IF;
END
$BODY$;
Amsterdam 13 october 2022
A function to check Primary keys and Foreign keys
--DROP FUNCTION compare_pk_fk(text,text,text,text)
CREATE OR REPLACE FUNCTION public.compare_pk_fk(text,text,text,text)
RETURNS TABLE("compare PK vs FK" text, PK text, FK text, PK_count bigint, FK_count bigint)
LANGUAGE 'plpgsql'
COST 100
VOLATILE PARALLEL UNSAFE
ROWS 1000
AS $BODY$
DECLARE
p_tabname1 TEXT := $1;
p_colname1 TEXT := $1||'.'||$2;
p_colname1_sec TEXT := $2;
p_tabname2 TEXT := $3;
p_colname2 TEXT := $3||'.'||$4;
p_colname2_sec TEXT := $4;
v_sql_statement TEXT;
--Usage: select * from compare_pk_fk('public.agreement', 'agreementnumber','public.agreementparty', 'agreementnumber')
--Usage: select * from compare_pk_fk('agreement', 'agreementnumber','agreementparty', 'agreementnumber')
--Usage: select * from compare_pk_fk('agreement', 'agreementnumber','agreementparty', 'agreementnumber')
--Usage: select * from compare_pk_fk('Party', 'PartyNumber','CollateralParty', 'PartyNumber');
BEGIN
SELECT -- nulls
'select ''null count'' ,'''
|| p_colname1
|| ''','''
|| p_colname2
|| ''', (select count('
||p_colname1
||') from '
||p_tabname1
||' where '
|| p_colname1||' = '''' or '||p_colname1||' isnull)'
||' ,'
||'(select count('
||p_colname2
||') from '
||p_tabname2
||' where '
|| p_colname2||' = '''' or '||p_colname2||' isnull)'
-- total
|| ' union '
|| ' select ''total records'','''
|| p_colname1
|| ''','''
|| p_colname2
|| ''', (select count('
||p_colname1
||' ) from '
|| p_tabname1
|| ') , (select count('
||p_colname2
||') from '
|| p_tabname2
|| ')'
-- total unique
|| ' union '
|| ' select ''total unique records'','''
|| p_colname1
|| ''','''
|| p_colname2
|| ''', (select count(distinct '
||p_colname1
||' ) from '
|| p_tabname1
|| ') , (select count(distinct '
||p_colname2
||') from '
|| p_tabname2
|| ')'
--distinct same
|| 'union select ''in both'', '''
|| p_colname1
|| ''','''
|| p_colname2
|| ''', (select count('|| p_colname1|| ') from '||p_tabname1
|| ' left join (select distinct '||p_colname2||' from '||p_tabname2||' ) x on '
|| p_colname1|| ' = x.'||p_colname2_sec||' where x.'||p_colname2_sec||' notnull), '
|| '(select count('|| p_colname2|| ') from '||p_tabname2
|| ' left join (select distinct '||p_colname1||' from '||p_tabname1||' ) x on '
|| p_colname2|| ' = x.'||p_colname1_sec||' where x.'||p_colname1_sec||' notnull) '
--distinct different
|| 'union select ''not in other'', '''
|| p_colname1
|| ''','''
|| p_colname2
|| ''', (select count('|| p_colname1|| ') from '||p_tabname1
|| ' left join (select distinct '||p_colname2||' from '||p_tabname2||' ) x on '
|| p_colname1|| ' = x.'||p_colname2_sec||' where x.'||p_colname2_sec||' isnull), '
|| '(select count('|| p_colname2|| ') from '||p_tabname2
|| ' left join (select distinct '||p_colname1||' from '||p_tabname1||' ) x on '
|| p_colname2|| ' = x.'||p_colname1_sec||' where x.'||p_colname1_sec||' isnull) '
INTO v_sql_statement;
IF v_sql_statement IS NOT NULL THEN
RETURN QUERY EXECUTE v_sql_statement;
END IF;
END
$BODY$;