Null Handling In Snowflake While Loading CSV Data

Null Handling In Snowflake While Loading CSV Data

This blogs explain how to handling null fields in your data file, be it CSV or delimiter separated files and load the data set effectively in your snowflake tables.

Null Handling In Snowflake While Loading CSV Data

Null handling in snowflake while loading data into tables might be a confusing topic for many data engineers. There are couple of paramters and default setting that can be used to make sure snowflake data loading works as expected.

This episode will describe different scenario and null representation in csv data file and how paramter at file format level or at copy command level help data engineers to load them correctly and efficiently. Once you complete this video, you will be able to answer following questions

  1. Null Handling Consideration In Snowflake with CSV data files.
  2. Different File Format level parameters to control null values?
  3. Limitation with null values while loading data into snowflake.
  4. Performance impact with null values in snowflake.
  5. How to handle special cases to load null values into snowflake tables having not-null constraints.

Sample Data Set

Here is the data set which is used in this guide.

SQL Used In This Null Data Loading Article

Part-01


-- use schema ttips.ch09

create or replace transient table user(
    id number,
    first_name varchar(100),
    middle_name varchar(100),
    last_name varchar(100),
    email varchar(100),
    dob varchar(10)
);
-- Create a user table and simple csv file format
create or replace file format csv_ff 
    type = 'csv' 
    compression = 'none' 
    field_delimiter = ',' 
    record_delimiter = '\n' 
    skip_header = 1 
    field_optionally_enclosed_by = '\042';
    
-- put command to copy into user stage
put 
    file:///tmp/ch09/csv/01_user_sample_with_nulls.csv
    @~/ch09/csv 
    auto_compress=false;

-- list the user stage.
list @~/ch09/csv/;

copy into user
    from @~/ch09/csv/01_user_sample_with_nulls.csv
    file_format = csv_ff
    on_error = 'continue';
    
    -- all rows loaded, lets see data 
    
select * from user;

Part-02



create or replace transient table user_backslash_n(
    id number,
    first_name varchar(100),
    middle_name varchar(100),
    last_name varchar(100),
    email varchar(100),
    dob varchar(10)
);
-- put command to copy into user stage
put 
    file:///tmp/ch09/csv/02_user_sample_with_slash_N.csv
    @~/ch09/csv 
    auto_compress=false;

-- list the user stage.
list @~/ch09/csv/;

copy into user_backslash_n
    from @~/ch09/csv/02_user_sample_with_slash_N.csv
    file_format = csv_ff
    on_error = 'continue';
    
    -- all rows loaded, lets see data 
    
select * from user;

select * from user where middle_name is not null; -- find all uses having middle name.
select * from user where email is not null; -- find all uses having email address

Part-03


create or replace transient table user_null_if(
    id number,
    first_name varchar(100),
    middle_name varchar(100),
    last_name varchar(100),
    email varchar(100),
    dob varchar(10)
);

create or replace file format csv_ff_null 
    type = 'csv' 
    compression = 'none' 
    field_delimiter = ',' 
    record_delimiter = '\n' 
    skip_header = 1 
    field_optionally_enclosed_by = '\042'
    null_if = '\\N'
    escape_unenclosed_field  = '\\';

-- list the user stage.
list @~/ch09/csv/;

copy into user_null_if
    from @~/ch09/csv/
    file_format = csv_ff_null
    on_error = 'continue'
    force = true
    pattern='.*[.]csv';
    
    -- all rows loaded, lets see data 
    
select * from user_null_if;

select * from user_null_if where middle_name is not null; -- find all uses having middle name.
select * from user_null_if where email is not null; -- find all uses having email address

Part-04


create or replace transient table user_many_null_if(
    id number,
    first_name varchar(100),
    middle_name varchar(100),
    last_name varchar(100),
    email varchar(100),
    dob varchar(10)
);

create or replace file format csv_ff_many_null 
    type = 'csv' 
    compression = 'none' 
    field_delimiter = ',' 
    record_delimiter = '\n' 
    skip_header = 1 
    field_optionally_enclosed_by = '\042'
    null_if = ('\\N', 'Null', 'NULL', 'null', '\\n')
    escape_unenclosed_field  = '\\';

-- list the user stage.
list @~/ch09/csv/;

copy into user_many_null_if
    from @~/ch09/csv/
    file_format = csv_ff_many_null
    on_error = 'continue'
    force = true
    pattern='.*[.]csv';
    
    -- all rows loaded, lets see data 
    
select * from user_many_null_if;

Part-05


create or replace transient table user_empty_null(
    id number,
    first_name varchar(100),
    middle_name varchar(100),
    last_name varchar(100),
    email varchar(100),
    dob varchar(10)
);

create or replace file format csv_ff_empty_null 
    type = 'csv' 
    compression = 'none' 
    field_delimiter = ',' 
    record_delimiter = '\n' 
    skip_header = 1 
    field_optionally_enclosed_by = '\042'
    null_if = ('\\N', 'Null', 'NULL', 'null', '\\n','');
    escape_unenclosed_field  = '\\'
    empty_field_as_null = true; -- Can not I specify it in null_if

-- list the user stage.
list @~/ch09/csv/empty;

copy into user_empty_null
    from @~/ch09/csv/empty/03_user_sample_with_empty.csv
    file_format = csv_ff_empty_null
    on_error = 'continue'
    force = true;
--    validation_mode = return_errors;
    
    -- all rows loaded, lets see data 
    
select * from user_empty_null;

select * from user_many_null_if where middle_name is not null; -- find all uses having middle name.
select * from user_many_null_if where email is not null; -- find all uses having email address

-- lets change the file format
create or replace file format csv_ff_empty_null 
    type = 'csv' 
    compression = 'none' 
    field_delimiter = ',' 
    record_delimiter = '\n' 
    skip_header = 1 
    field_optionally_enclosed_by = '\042'
    null_if = ('\\N', 'Null', 'NULL', 'null', '\\n','');
    escape_unenclosed_field  = '\\'
    empty_field_as_null = true; -- Can not I specify it in null_if


Part-06


create or replace transient table user_not_null(
    id number,
    first_name varchar(100),
    middle_name varchar(100),
    last_name varchar(100),
    email varchar(100) not null,
    dob varchar(10)
);

create or replace file format csv_ff_not_null 
    type = 'csv' 
    compression = 'none' 
    field_delimiter = ',' 
    record_delimiter = '\n' 
    skip_header = 1 
    field_optionally_enclosed_by = '\042'
    null_if = ('\\N', 'Null', 'NULL', 'null', '\\n','');
    escape_unenclosed_field  = '\\'
    empty_field_as_null = true; -- Can not I specify it in null_if

-- list the user stage.
list @~/ch09/csv/empty;

copy into user_not_null
    from @~/ch09/csv/empty/03_user_sample_with_empty.csv
    file_format = csv_ff_not_null
    on_error = 'continue'
    force = true;

-- so how can you solve it.

truncate table user_not_null;

copy into user_not_null
    from 
     (
        select $1, $2, $3, $4, nvl($5,'not-known'),$6  from @~/ch09/csv/empty/03_user_sample_with_empty.csv tbl
    ) 
    
    file_format = csv_ff_not_null
    on_error = 'continue';
    force = true;
    
select * from user_not_null;

   

Part-07


put 
 file:///tmp/ch09/csv/01_customer_500k_with_nulls.csv.gz
 @~/ch09/csv/500k
 paralle=50;
                                         
-- here is my large files (csv compressed with gzip)
list  @~/ch09/csv/500k;

create or replace transient table customer_large (
	customer_pk number(38,0),
	salutation varchar(10),
	first_name varchar(20),
	last_name varchar(30),
	gender varchar(1),
	marital_status varchar(1),
	day_of_birth date,
	birth_country varchar(60),
	email_address varchar(50),
	city_name varchar(60),
	zip_code varchar(10),
	country_name varchar(20),
	gmt_timezone_offset number(10,2),
	preferred_cust_flag boolean,
	registration_time timestamp_ltz(9)
);

-- since they are gz files.. we need a new file format

create or replace file format csv_gz_ff
    type = 'csv' 
    compression = 'gzip' 
    field_delimiter = ','
    field_optionally_enclosed_by = '\042'
    --null_if = ('\\N', 'Null', 'NULL', 'null', '\\n')
    skip_header = 1 ;
    
    
-- copy command with validation mode = for all errors
copy into customer_large
    from @~/ch09/csv/500k/01_customer_500k_with_nulls.csv.gz
    file_format = csv_gz_ff
    on_error = 'continue'
    force = true;
    
select * from customer_large limit 10;

truncate customer_large;
 
 
 create or replace transient table customer_large_null (
	customer_pk number(38,0),
	salutation varchar(10),
	first_name varchar(20),
	last_name varchar(30),
	gender varchar(1),
	marital_status varchar(1),
	day_of_birth date,
	birth_country varchar(60),
	email_address varchar(50),
	city_name varchar(60),
	zip_code varchar(10),
	country_name varchar(20),
	gmt_timezone_offset number(10,2),
	preferred_cust_flag boolean,
	registration_time timestamp_ltz(9)
);

copy into customer_large_null
    from @~/ch09/csv/500k/01_customer_500k_with_nulls.csv.gz
    file_format = csv_gz_ff
    on_error = 'continue'
    force = true;