1. Loading unstrunctured data
- 스테이지 생성
- raw data 로드
- 분석 & 파싱
- 평면화 & 로드
2. Creating stage & raw file
- 스테이지 생성
CREATE OR REPLACE stage MANAGE_DB.EXTERNAL_STAGES.JSONSTAGE
url='s3://bucketsnowflake-jsondemo';
- 스테이지 확인
LIST @MANAGE_DB.EXTERNAL_STAGES.JSONSTAGE;
- file format 생성
CREATE OR REPLACE file format MANAGE_DB.FILE_FORMATS.JSONFORMAT
TYPE = JSON;
- variant 옵션을 사용하여 테이블 생성
CREATE OR REPLACE table OUR_FIRST_DB.PUBLIC.JSON_RAW (
raw_file variant);
- 데이터 복제
COPY INTO OUR_FIRST_DB.PUBLIC.JSON_RAW
FROM @MANAGE_DB.EXTERNAL_STAGES.JSONSTAGE
file_format= MANAGE_DB.FILE_FORMATS.JSONFORMAT
files = ('HR_data.json');
- 데이터 확인
SELECT * FROM OUR_FIRST_DB.PUBLIC.JSON_RAW;
3. Parsing JSON
- 속성을 명시하여 조회
SELECT RAW_FILE:city FROM OUR_FIRST_DB.PUBLIC.JSON_RAW;
- 열을 명시하여 조회
SELECT $1:first_name FROM OUR_FIRST_DB.PUBLIC.JSON_RAW;
- 속성의 형식을 명시하여 조회
SELECT RAW_FILE:first_name::string as first_name FROM OUR_FIRST_DB.PUBLIC.JSON_RAW;
SELECT RAW_FILE:id::int as id FROM OUR_FIRST_DB.PUBLIC.JSON_RAW;
- 종합
SELECT
RAW_FILE:id::int as id,
RAW_FILE:first_name::STRING as first_name,
RAW_FILE:last_name::STRING as last_name,
RAW_FILE:gender::STRING as gender
FROM OUR_FIRST_DB.PUBLIC.JSON_RAW;
4. Handling nested data
example 1 : nested data
- nested data 컬럼만 조회
SELECT RAW_FILE:job as job FROM OUR_FIRST_DB.PUBLIC.JSON_RAW;
- 속성과 속성의 형식을 명시하여 조회
SELECT
RAW_FILE:job.salary::INT as salary
FROM OUR_FIRST_DB.PUBLIC.JSON_RAW;
- 종합
SELECT
RAW_FILE:first_name::STRING as first_name,
RAW_FILE:job.salary::INT as salary,
RAW_FILE:job.title::STRING as title
FROM OUR_FIRST_DB.PUBLIC.JSON_RAW;
example 2 : arrays
- 배열 형식의 데이터 조회
SELECT
RAW_FILE:prev_company as prev_company
FROM OUR_FIRST_DB.PUBLIC.JSON_RAW;
- []를 넣어 원하는 데이터만 조회 - []안의 숫자가 0부터 첫 번째 순서로 조회함
SELECT
RAW_FILE:prev_company[1]::STRING as prev_company
FROM OUR_FIRST_DB.PUBLIC.JSON_RAW;
- ARRAY_SIZE 옵션을 통해 집계 - 특정 데이터만 볼 수 있다는 문제점이 있음
SELECT
ARRAY_SIZE(RAW_FILE:prev_company) as prev_company
FROM OUR_FIRST_DB.PUBLIC.JSON_RAW;
- UNION ALL을 통해 조회
SELECT
RAW_FILE:id::int as id,
RAW_FILE:first_name::STRING as first_name,
RAW_FILE:prev_company[0]::STRING as prev_company
FROM OUR_FIRST_DB.PUBLIC.JSON_RAW
UNION ALL
SELECT
RAW_FILE:id::int as id,
RAW_FILE:first_name::STRING as first_name,
RAW_FILE:prev_company[1]::STRING as prev_company
FROM OUR_FIRST_DB.PUBLIC.JSON_RAW
ORDER BY id;
5. Dealing with hierarchy
- 배열과 nested data가 동시에 있는 데이터 확인
SELECT
RAW_FILE:spoken_languages as spoken_languages
FROM OUR_FIRST_DB.PUBLIC.JSON_RAW;
- 배열에 몇개의 데이터가 있는지 확인
SELECT
array_size(RAW_FILE:spoken_languages) as spoken_languages
FROM OUR_FIRST_DB.PUBLIC.JSON_RAW;
- 이름과 결합된 배열 안의 데이터 개수 확인
SELECT
RAW_FILE:first_name::STRING as first_name,
array_size(RAW_FILE:spoken_languages) as spoken_languages
FROM OUR_FIRST_DB.PUBLIC.JSON_RAW;
- sponken_languages 배열 안의 첫 번째 데이터 확인
SELECT
RAW_FILE:spoken_languages[0] as First_language
FROM OUR_FIRST_DB.PUBLIC.JSON_RAW;
- 이름과 결합된 배열 안의 첫 번째 데이터 확인
SELECT
RAW_FILE:first_name::STRING as first_name,
RAW_FILE:spoken_languages[0] as First_language
FROM OUR_FIRST_DB.PUBLIC.JSON_RAW;
- 배열 안의 데이터를 각각의 컬럼으로 표현
SELECT
RAW_FILE:first_name::STRING as First_name,
RAW_FILE:spoken_languages[0].language::STRING as First_language,
RAW_FILE:spoken_languages[0].level::STRING as Level_spoken
FROM OUR_FIRST_DB.PUBLIC.JSON_RAW;
- UNION ALL 을 활용한 조회
SELECT
RAW_FILE:id::int as id,
RAW_FILE:first_name::STRING as First_name,
RAW_FILE:spoken_languages[0].language::STRING as First_language,
RAW_FILE:spoken_languages[0].level::STRING as Level_spoken
FROM OUR_FIRST_DB.PUBLIC.JSON_RAW
UNION ALL
SELECT
RAW_FILE:id::int as id,
RAW_FILE:first_name::STRING as First_name,
RAW_FILE:spoken_languages[1].language::STRING as First_language,
RAW_FILE:spoken_languages[1].level::STRING as Level_spoken
FROM OUR_FIRST_DB.PUBLIC.JSON_RAW
UNION ALL
SELECT
RAW_FILE:id::int as id,
RAW_FILE:first_name::STRING as First_name,
RAW_FILE:spoken_languages[2].language::STRING as First_language,
RAW_FILE:spoken_languages[2].level::STRING as Level_spoken
FROM OUR_FIRST_DB.PUBLIC.JSON_RAW
ORDER BY ID;
- f.value를 활용한 계층 데이터를 간단히 작업하는 방법
select
RAW_FILE:first_name::STRING as First_name,
f.value:language::STRING as First_language,
f.value:level::STRING as Level_spoken
from OUR_FIRST_DB.PUBLIC.JSON_RAW, table(flatten(RAW_FILE:spoken_languages)) f;
6. Insert final data
example 1 : CREATE TABLE AS 로 final data 생성
- CREATE OR REPLACE TABLE AS SELECT 로 새로운 테이블 생성
CREATE OR REPLACE TABLE Languages AS
select
RAW_FILE:first_name::STRING as First_name,
f.value:language::STRING as First_language,
f.value:level::STRING as Level_spoken
from OUR_FIRST_DB.PUBLIC.JSON_RAW, table(flatten(RAW_FILE:spoken_languages)) f;
- 생성한 테이블 확인
SELECT * FROM Languages;
- 생성한 테이블 truncate
truncate table languages;
example 2 : INSERT INTO 로 final data 생성
- INSERT INTO 로 생성되어 있는 테이블에 데이터 삽입
INSERT INTO Languages
select
RAW_FILE:first_name::STRING as First_name,
f.value:language::STRING as First_language,
f.value:level::STRING as Level_spoken
from OUR_FIRST_DB.PUBLIC.JSON_RAW, table(flatten(RAW_FILE:spoken_languages)) f;
- 삽입한 테이블 확인
SELECT * FROM Languages;
7. Querying PARQUET data
example 1 : file format과 스테이지 객체를 미리 만들어 놓고 사용
- file format 생성
CREATE OR REPLACE FILE FORMAT MANAGE_DB.FILE_FORMATS.PARQUET_FORMAT
TYPE = 'parquet';
- 스테이지 생성
CREATE OR REPLACE STAGE MANAGE_DB.EXTERNAL_STAGES.PARQUETSTAGE
url = 's3://snowflakeparquetdemo'
FILE_FORMAT = MANAGE_DB.FILE_FORMATS.PARQUET_FORMAT;
- 스테이지 확인
LIST @MANAGE_DB.EXTERNAL_STAGES.PARQUETSTAGE;
- parquet 데이터 확인
SELECT * FROM @MANAGE_DB.EXTERNAL_STAGES.PARQUETSTAGE;
example 2 : 쿼리 안에 file format이 포함됨
- 스테이지 생성
CREATE OR REPLACE STAGE MANAGE_DB.EXTERNAL_STAGES.PARQUETSTAGE
url = 's3://snowflakeparquetdemo' ;
- file format => 을 사용하여 쿼리 안에 file format을 명시
SELECT *
FROM @MANAGE_DB.EXTERNAL_STAGES.PARQUETSTAGE
(file_format => 'MANAGE_DB.FILE_FORMATS.PARQUET_FORMAT');
- 작은 따옴표 생략 가능
SELECT *
FROM @MANAGE_DB.EXTERNAL_STAGES.PARQUETSTAGE
(file_format => MANAGE_DB.FILE_FORMATS.PARQUET_FORMAT);
example 3 : 구조화되지 않은 데이터 쿼리 구문
- 스테이지 생성
CREATE OR REPLACE STAGE MANAGE_DB.EXTERNAL_STAGES.PARQUETSTAGE
url = 's3://snowflakeparquetdemo'
FILE_FORMAT = MANAGE_DB.FILE_FORMATS.PARQUET_FORMAT;
- 컬럼을 명시해서 조회
SELECT
$1:__index_level_0__,
$1:cat_id,
$1:date,
$1:"__index_level_0__",
$1:"cat_id",
$1:"d",
$1:"date",
$1:"dept_id",
$1:"id",
$1:"item_id",
$1:"state_id",
$1:"store_id",
$1:"value"
FROM @MANAGE_DB.EXTERNAL_STAGES.PARQUETSTAGE;
example 4 : 데이터 전환
- 데이터 전환과 alias를 사용해 조회
SELECT
$1:__index_level_0__::int as index_level,
$1:cat_id::VARCHAR(50) as category,
DATE($1:date::int ) as Date,
$1:"dept_id"::VARCHAR(50) as Dept_ID,
$1:"id"::VARCHAR(50) as ID,
$1:"item_id"::VARCHAR(50) as Item_ID,
$1:"state_id"::VARCHAR(50) as State_ID,
$1:"store_id"::VARCHAR(50) as Store_ID,
$1:"value"::int as value
FROM @MANAGE_DB.EXTERNAL_STAGES.PARQUETSTAGE;
8. Loading PARQUET data
- 메타데이터 추가
SELECT
$1:__index_level_0__::int as index_level,
$1:cat_id::VARCHAR(50) as category,
DATE($1:date::int ) as Date,
$1:"dept_id"::VARCHAR(50) as Dept_ID,
$1:"id"::VARCHAR(50) as ID,
$1:"item_id"::VARCHAR(50) as Item_ID,
$1:"state_id"::VARCHAR(50) as State_ID,
$1:"store_id"::VARCHAR(50) as Store_ID,
$1:"value"::int as value,
METADATA$FILENAME as FILENAME,
METADATA$FILE_ROW_NUMBER as ROWNUMBER,
TO_TIMESTAMP_NTZ(current_timestamp) as LOAD_DATE
FROM @MANAGE_DB.EXTERNAL_STAGES.PARQUETSTAGE;
- 데이터가 로드된 시각
SELECT TO_TIMESTAMP_NTZ(current_timestamp);
- 테이블 생성
CREATE OR REPLACE TABLE OUR_FIRST_DB.PUBLIC.PARQUET_DATA (
ROW_NUMBER int,
index_level int,
cat_id VARCHAR(50),
date date,
dept_id VARCHAR(50),
id VARCHAR(50),
item_id VARCHAR(50),
state_id VARCHAR(50),
store_id VARCHAR(50),
value int,
Load_date timestamp default TO_TIMESTAMP_NTZ(current_timestamp));
- parquet 데이터 로드
COPY INTO OUR_FIRST_DB.PUBLIC.PARQUET_DATA
FROM (SELECT
METADATA$FILE_ROW_NUMBER,
$1:__index_level_0__::int,
$1:cat_id::VARCHAR(50),
DATE($1:date::int ),
$1:"dept_id"::VARCHAR(50),
$1:"id"::VARCHAR(50),
$1:"item_id"::VARCHAR(50),
$1:"state_id"::VARCHAR(50),
$1:"store_id"::VARCHAR(50),
$1:"value"::int,
TO_TIMESTAMP_NTZ(current_timestamp)
FROM @MANAGE_DB.EXTERNAL_STAGES.PARQUETSTAGE);
- 데이터 확인
SELECT * FROM OUR_FIRST_DB.PUBLIC.PARQUET_DATA;
Udemy의 'Snowflake - The Complete Masterclass (2023 Edition)'를 공부한 내용을 바탕으로 작성하였습니다.
'Snowflake > Snowflake - The Complete Masterclass' 카테고리의 다른 글
[Snowflake] Loading from AWS (0) | 2023.02.06 |
---|---|
[Snowflake] Performance optimization (0) | 2023.02.03 |
[Snowflake] Copy options (0) | 2023.02.01 |
[Snowflake] Loading data (0) | 2023.01.30 |
[Snowflake] Snowflake Architecture (1) | 2023.01.27 |