Null Handling In Snowflake While Loading CSV Data
Summary
This blogs explain how to handling null fields in your data file, be it CSV or delimiter separated files and load the data set effectively in your snowflake tables.
Null Handling In Snowflake While Loading CSV Data
Null handling in snowflake while loading data into tables might be a confusing topic for many data engineers. There are couple of paramters and default setting that can be used to make sure snowflake data loading works as expected.
This episode will describe different scenario and null representation in csv data file and how paramter at file format level or at copy command level help data engineers to load them correctly and efficiently. Once you complete this video, you will be able to answer following questions
- Null Handling Consideration In Snowflake with CSV data files.
- Different File Format level parameters to control null values?
- Limitation with null values while loading data into snowflake.
- Performance impact with null values in snowflake.
- How to handle special cases to load null values into snowflake tables having not-null constraints.
Sample Data Set
Here is the data set which is used in this guide.
SQL Used In This Null Data Loading Article
Part-01
-- use schema ttips.ch09
create or replace transient table user(
id number,
first_name varchar(100),
middle_name varchar(100),
last_name varchar(100),
email varchar(100),
dob varchar(10)
);
-- Create a user table and simple csv file format
create or replace file format csv_ff
type = 'csv'
compression = 'none'
field_delimiter = ','
record_delimiter = '\n'
skip_header = 1
field_optionally_enclosed_by = '\042';
-- put command to copy into user stage
put
file:///tmp/ch09/csv/01_user_sample_with_nulls.csv
@~/ch09/csv
auto_compress=false;
-- list the user stage.
list @~/ch09/csv/;
copy into user
from @~/ch09/csv/01_user_sample_with_nulls.csv
file_format = csv_ff
on_error = 'continue';
-- all rows loaded, lets see data
select * from user;
Part-02
create or replace transient table user_backslash_n(
id number,
first_name varchar(100),
middle_name varchar(100),
last_name varchar(100),
email varchar(100),
dob varchar(10)
);
-- put command to copy into user stage
put
file:///tmp/ch09/csv/02_user_sample_with_slash_N.csv
@~/ch09/csv
auto_compress=false;
-- list the user stage.
list @~/ch09/csv/;
copy into user_backslash_n
from @~/ch09/csv/02_user_sample_with_slash_N.csv
file_format = csv_ff
on_error = 'continue';
-- all rows loaded, lets see data
select * from user;
select * from user where middle_name is not null; -- find all uses having middle name.
select * from user where email is not null; -- find all uses having email address
Part-03
create or replace transient table user_null_if(
id number,
first_name varchar(100),
middle_name varchar(100),
last_name varchar(100),
email varchar(100),
dob varchar(10)
);
create or replace file format csv_ff_null
type = 'csv'
compression = 'none'
field_delimiter = ','
record_delimiter = '\n'
skip_header = 1
field_optionally_enclosed_by = '\042'
null_if = '\\N'
escape_unenclosed_field = '\\';
-- list the user stage.
list @~/ch09/csv/;
copy into user_null_if
from @~/ch09/csv/
file_format = csv_ff_null
on_error = 'continue'
force = true
pattern='.*[.]csv';
-- all rows loaded, lets see data
select * from user_null_if;
select * from user_null_if where middle_name is not null; -- find all uses having middle name.
select * from user_null_if where email is not null; -- find all uses having email address
Part-04
create or replace transient table user_many_null_if(
id number,
first_name varchar(100),
middle_name varchar(100),
last_name varchar(100),
email varchar(100),
dob varchar(10)
);
create or replace file format csv_ff_many_null
type = 'csv'
compression = 'none'
field_delimiter = ','
record_delimiter = '\n'
skip_header = 1
field_optionally_enclosed_by = '\042'
null_if = ('\\N', 'Null', 'NULL', 'null', '\\n')
escape_unenclosed_field = '\\';
-- list the user stage.
list @~/ch09/csv/;
copy into user_many_null_if
from @~/ch09/csv/
file_format = csv_ff_many_null
on_error = 'continue'
force = true
pattern='.*[.]csv';
-- all rows loaded, lets see data
select * from user_many_null_if;
Part-05
create or replace transient table user_empty_null(
id number,
first_name varchar(100),
middle_name varchar(100),
last_name varchar(100),
email varchar(100),
dob varchar(10)
);
create or replace file format csv_ff_empty_null
type = 'csv'
compression = 'none'
field_delimiter = ','
record_delimiter = '\n'
skip_header = 1
field_optionally_enclosed_by = '\042'
null_if = ('\\N', 'Null', 'NULL', 'null', '\\n','');
escape_unenclosed_field = '\\'
empty_field_as_null = true; -- Can not I specify it in null_if
-- list the user stage.
list @~/ch09/csv/empty;
copy into user_empty_null
from @~/ch09/csv/empty/03_user_sample_with_empty.csv
file_format = csv_ff_empty_null
on_error = 'continue'
force = true;
-- validation_mode = return_errors;
-- all rows loaded, lets see data
select * from user_empty_null;
select * from user_many_null_if where middle_name is not null; -- find all uses having middle name.
select * from user_many_null_if where email is not null; -- find all uses having email address
-- lets change the file format
create or replace file format csv_ff_empty_null
type = 'csv'
compression = 'none'
field_delimiter = ','
record_delimiter = '\n'
skip_header = 1
field_optionally_enclosed_by = '\042'
null_if = ('\\N', 'Null', 'NULL', 'null', '\\n','');
escape_unenclosed_field = '\\'
empty_field_as_null = true; -- Can not I specify it in null_if
Part-06
create or replace transient table user_not_null(
id number,
first_name varchar(100),
middle_name varchar(100),
last_name varchar(100),
email varchar(100) not null,
dob varchar(10)
);
create or replace file format csv_ff_not_null
type = 'csv'
compression = 'none'
field_delimiter = ','
record_delimiter = '\n'
skip_header = 1
field_optionally_enclosed_by = '\042'
null_if = ('\\N', 'Null', 'NULL', 'null', '\\n','');
escape_unenclosed_field = '\\'
empty_field_as_null = true; -- Can not I specify it in null_if
-- list the user stage.
list @~/ch09/csv/empty;
copy into user_not_null
from @~/ch09/csv/empty/03_user_sample_with_empty.csv
file_format = csv_ff_not_null
on_error = 'continue'
force = true;
-- so how can you solve it.
truncate table user_not_null;
copy into user_not_null
from
(
select $1, $2, $3, $4, nvl($5,'not-known'),$6 from @~/ch09/csv/empty/03_user_sample_with_empty.csv tbl
)
file_format = csv_ff_not_null
on_error = 'continue';
force = true;
select * from user_not_null;
Part-07
put
file:///tmp/ch09/csv/01_customer_500k_with_nulls.csv.gz
@~/ch09/csv/500k
paralle=50;
-- here is my large files (csv compressed with gzip)
list @~/ch09/csv/500k;
create or replace transient table customer_large (
customer_pk number(38,0),
salutation varchar(10),
first_name varchar(20),
last_name varchar(30),
gender varchar(1),
marital_status varchar(1),
day_of_birth date,
birth_country varchar(60),
email_address varchar(50),
city_name varchar(60),
zip_code varchar(10),
country_name varchar(20),
gmt_timezone_offset number(10,2),
preferred_cust_flag boolean,
registration_time timestamp_ltz(9)
);
-- since they are gz files.. we need a new file format
create or replace file format csv_gz_ff
type = 'csv'
compression = 'gzip'
field_delimiter = ','
field_optionally_enclosed_by = '\042'
--null_if = ('\\N', 'Null', 'NULL', 'null', '\\n')
skip_header = 1 ;
-- copy command with validation mode = for all errors
copy into customer_large
from @~/ch09/csv/500k/01_customer_500k_with_nulls.csv.gz
file_format = csv_gz_ff
on_error = 'continue'
force = true;
select * from customer_large limit 10;
truncate customer_large;
create or replace transient table customer_large_null (
customer_pk number(38,0),
salutation varchar(10),
first_name varchar(20),
last_name varchar(30),
gender varchar(1),
marital_status varchar(1),
day_of_birth date,
birth_country varchar(60),
email_address varchar(50),
city_name varchar(60),
zip_code varchar(10),
country_name varchar(20),
gmt_timezone_offset number(10,2),
preferred_cust_flag boolean,
registration_time timestamp_ltz(9)
);
copy into customer_large_null
from @~/ch09/csv/500k/01_customer_500k_with_nulls.csv.gz
file_format = csv_gz_ff
on_error = 'continue'
force = true;