Snowflake Micro Partition
Snowflake Micro Partition
Most of the enterprise data warehouses support data partition to optimize the IO operation and speed up the query performance. Snowflake cloud data warehouse also has an automatic data clustering feature and strongly claims that developers don’t need to partition data unless the table has TBs of data. All the data in tables is automatically divided into micro-partitions, which are contiguous units of storage. Snowflake is columnar-based and horizontally partitioned, meaning a row of data is stored in the same micro-partition.
Deep Dive Snowflake Micro Partition & Clustering Concept
This video tutorial provides complete coverage of of snowflake’s micro partition concept.
SQL Script
Creating Tables Using Cluster Key
// Step-1
// Create a table having no cluster key
create or replace table t1_no_cluster (
o_orderkey number(38,0),
o_custkey number(38,0),
o_orderstatus varchar(1),
o_totalprice number(12,2),
o_orderdate date,
o_orderpriority varchar(15),
o_clerk varchar(15),
o_shippriority number(38,0),
o_comment varchar(79)
);
insert into t1_no_cluster select * from SNOWFLAKE_SAMPLE_DATA.TPCH_SF1000.ORDERS limit 500000000;
// table t2 having order date as cluster key
create or replace table t2_order_dt (
o_orderkey number(38,0),
o_custkey number(38,0),
o_orderstatus varchar(1),
o_totalprice number(12,2),
o_orderdate date,
o_orderpriority varchar(15),
o_clerk varchar(15),
o_shippriority number(38,0),
o_comment varchar(79)
)
cluster by (o_orderdate);
insert into t2_order_dt select * from SNOWFLAKE_SAMPLE_DATA.TPCH_SF1000.ORDERS limit 500000000;
// create at table having order priority as cluster key
create or replace table t3_order_priority (
o_orderkey number(38,0),
o_custkey number(38,0),
o_orderstatus varchar(1),
o_totalprice number(12,2),
o_orderdate date,
o_orderpriority varchar(15),
o_clerk varchar(15),
o_shippriority number(38,0),
o_comment varchar(79)
)
cluster by (o_orderpriority);
insert into t3_order_priority select * from SNOWFLAKE_SAMPLE_DATA.TPCH_SF1000.ORDERS limit 500000000;
// Step-2
//let me populat data and here I am fast forwarding it
//as populating 5M in each table taks sometime..
//Step-3
// lets see the table and associated cluster keys
show tables like 't%_%_';
//Step-4
// lets see the record count
select count(*) from t1_no_cluster;
select count(*) from t2_order_dt;
select count(*) from t3_order_priority;
//Step-5
// now run a query against each table and see the behaviour
// we will use the timestamp as field so it does not use the cache
// you can also disable the catch using parameter setting at session level.
// we will also chose a different warehouse of same size...
//Step-6
// lets validate our 1st query in our default warehouse where order priority is medium.
select * from t1_no_cluster where O_ORDERPRIORITY = '3-MEDIUM' and current_timestamp() > '2020-01-01 08:08:49.426 -0800' limit 10;
select * from t2_order_dt where O_ORDERPRIORITY = '3-MEDIUM' and current_timestamp() > '2020-01-01 08:08:49.426 -0800' limit 10;
select * from t3_order_priority where O_ORDERPRIORITY = '3-MEDIUM' and current_timestamp() > '2020-01-01 08:08:49.426 -0800' limit 10;
//Step-7
// ===================
-- chose a different wh and resume it to make sure we focus only on query execution time
use warehouse wh_01;
alter warehouse wh_01 RESUME IF SUSPENDED;
select * from t1_no_cluster where O_ORDERPRIORITY = '3-MEDIUM' and current_timestamp() > '2020-01-02 08:08:49.426 -0800' limit 10;
-- has no cluster key
//Step-8
// ===================
-- chose a different wh and resume it to make sure we focus only on query execution time
use warehouse wh_02;
alter warehouse wh_02 RESUME IF SUSPENDED;
select * from t2_order_dt where O_ORDERPRIORITY = '3-MEDIUM' and current_timestamp() > '2020-01-01 08:08:49.426 -0800' limit 10;
-- has order dt as cluster key
//Step-9
// ===================
-- chose a different wh and resume it to make sure we focus only on query execution time
use warehouse wh_03;
alter warehouse wh_03 RESUME IF SUSPENDED;
select * from t3_order_priority where O_ORDERPRIORITY = '3-MEDIUM' and current_timestamp() > '2020-01-01 08:08:49.426 -0800' limit 10;
-- has order priority as cluster key
Alter Cluster Key
-- changing the cluster by key
-- t3_order_priority has order priority to order date
alter table t3_order_priority cluster by (O_ORDERDATE);
show tables;
-- drop cluster key
-- cluster by = clustering key (pay attention)
alter table t3_order_priority drop clustering key ;
-- recluster the table but it is deprecated
-- and it is unsupported feature
alter table t2_order_dt recluster ;
// ===================================
-- enables or disables Automatic Clustering for the table.
-- if you have cluster key and wants to disable automatic re-clustering
alter table t2_order_dt suspend recluster ;
-- if you have cluster key and wants to enable automatic re-clustering
alter table t2_order_dt resume recluster ;
alter table t1_no_cluster resume recluster ;
Clustering Depth
// t1 has no cluster key
select system$clustering_depth('t1_no_cluster');
//t2 has order dt as cluster key
select system$clustering_depth('t2_order_dt');
// t3 has a key but we dropped it.
select system$clustering_depth('t3_order_priority');
Clustering Ration - Deprecated
-- it is deprecated and replaced by
select system$clustering_ratio('t2_order_dt', '(o_orderdate)');
Clustering Histogram - Clustering Information
select system$clustering_information('t2_order_dt', '(o_orderdate)');