Post

Mysql 실습 01

Mysql 실습 01
1
2
3
use analyze_db;

SELECT * FROM TB_POPLTN_DATA; # Data Warehouse
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
-- 컬럼 레벨에 집계의 기준이 있으면 그 데이터는 집계가 불가능!!!
-- 컬럼은 항목을 분류하기 위해 만드는 거지 !! 

INSERT INTO TB_POPLTN
SELECT A.ADMINIST_ZONE_NO, A.ADMINIST_ZONE_NM, A.STD_MT
     , CASE WHEN LVL1 = 1 THEN 'M' WHEN LVL1 = 2 THEN 'F' WHEN LVL1 = 3 THEN 'T' END AS POPLTN_SE_CD
     , CASE WHEN LVL2 = 1  THEN '000' WHEN LVL2 = 2  THEN '010' WHEN LVL2 = 3  THEN '020'
            WHEN LVL2 = 4  THEN '030' WHEN LVL2 = 5  THEN '040' WHEN LVL2 = 6  THEN '050'
            WHEN LVL2 = 7  THEN '060' WHEN LVL2 = 8  THEN '070' WHEN LVL2 = 9  THEN '080' WHEN LVL2 = 10 THEN '090' WHEN LVL2 = 11 THEN '100' END AS AGRDE_SE_CD
     , CASE WHEN LVL1 = 1 AND LVL2 = 1  THEN MALE_POPLTN_CO_0_9     WHEN LVL1 = 1 AND LVL2 = 2  THEN MALE_POPLTN_CO_10_19
            WHEN LVL1 = 1 AND LVL2 = 3  THEN MALE_POPLTN_CO_20_29   WHEN LVL1 = 1 AND LVL2 = 4  THEN MALE_POPLTN_CO_30_39
            WHEN LVL1 = 1 AND LVL2 = 5  THEN MALE_POPLTN_CO_40_49   WHEN LVL1 = 1 AND LVL2 = 6  THEN MALE_POPLTN_CO_50_59
            WHEN LVL1 = 1 AND LVL2 = 7  THEN MALE_POPLTN_CO_60_69   WHEN LVL1 = 1 AND LVL2 = 8  THEN MALE_POPLTN_CO_70_79
            WHEN LVL1 = 1 AND LVL2 = 9  THEN MALE_POPLTN_CO_80_89   WHEN LVL1 = 1 AND LVL2 = 10 THEN MALE_POPLTN_CO_90_99
            WHEN LVL1 = 1 AND LVL2 = 11 THEN MALE_POPLTN_CO_100     WHEN LVL1 = 2 AND LVL2 = 1  THEN FEMALE_POPLTN_CO_0_9
            WHEN LVL1 = 2 AND LVL2 = 2  THEN FEMALE_POPLTN_CO_10_19 WHEN LVL1 = 2 AND LVL2 = 3  THEN FEMALE_POPLTN_CO_20_29
            WHEN LVL1 = 2 AND LVL2 = 4  THEN FEMALE_POPLTN_CO_30_39 WHEN LVL1 = 2 AND LVL2 = 5  THEN FEMALE_POPLTN_CO_40_49
            WHEN LVL1 = 2 AND LVL2 = 6  THEN FEMALE_POPLTN_CO_50_59 WHEN LVL1 = 2 AND LVL2 = 7  THEN FEMALE_POPLTN_CO_60_69
            WHEN LVL1 = 2 AND LVL2 = 8  THEN FEMALE_POPLTN_CO_70_79 WHEN LVL1 = 2 AND LVL2 = 9  THEN FEMALE_POPLTN_CO_80_89
            WHEN LVL1 = 2 AND LVL2 = 10 THEN FEMALE_POPLTN_CO_90_99 WHEN LVL1 = 2 AND LVL2 = 11 THEN FEMALE_POPLTN_CO_100
            WHEN LVL1 = 3 AND LVL2 = 1  THEN POPLTN_CO_0_9          WHEN LVL1 = 3 AND LVL2 = 2  THEN POPLTN_CO_10_19
            WHEN LVL1 = 3 AND LVL2 = 3  THEN POPLTN_CO_20_29        WHEN LVL1 = 3 AND LVL2 = 4  THEN POPLTN_CO_30_39
            WHEN LVL1 = 3 AND LVL2 = 5  THEN POPLTN_CO_40_49        WHEN LVL1 = 3 AND LVL2 = 6  THEN POPLTN_CO_50_59
            WHEN LVL1 = 3 AND LVL2 = 7  THEN POPLTN_CO_60_69        WHEN LVL1 = 3 AND LVL2 = 8  THEN POPLTN_CO_70_79
            WHEN LVL1 = 3 AND LVL2 = 9  THEN POPLTN_CO_80_89        WHEN LVL1 = 3 AND LVL2 = 10 THEN POPLTN_CO_90_99
            WHEN LVL1 = 3 AND LVL2 = 11 THEN POPLTN_CO_100 END AS POPLTN_CNT
  FROM
     (
      SELECT SUBSTR(ADMINIST_ZONE, INSTR(ADMINIST_ZONE, '(') + 1, 10) AS ADMINIST_ZONE_NO
           , SUBSTR(ADMINIST_ZONE, 1, INSTR(ADMINIST_ZONE, '(')-1) AS ADMINIST_ZONE_NM,
             '202304' AS STD_MT
           , MALE_POPLTN_CO_0_9    , MALE_POPLTN_CO_10_19  , MALE_POPLTN_CO_20_29
           , MALE_POPLTN_CO_30_39  , MALE_POPLTN_CO_40_49  , MALE_POPLTN_CO_50_59
           , MALE_POPLTN_CO_60_69  , MALE_POPLTN_CO_70_79  , MALE_POPLTN_CO_80_89  , MALE_POPLTN_CO_90_99  , MALE_POPLTN_CO_100
           , FEMALE_POPLTN_CO_0_9  , FEMALE_POPLTN_CO_10_19, FEMALE_POPLTN_CO_20_29
           , FEMALE_POPLTN_CO_30_39, FEMALE_POPLTN_CO_40_49, FEMALE_POPLTN_CO_50_59
           , FEMALE_POPLTN_CO_60_69, FEMALE_POPLTN_CO_70_79, FEMALE_POPLTN_CO_80_89, FEMALE_POPLTN_CO_90_99, FEMALE_POPLTN_CO_100
           , POPLTN_CO_0_9         , POPLTN_CO_10_19, POPLTN_CO_20_29
           , POPLTN_CO_30_39       , POPLTN_CO_40_49, POPLTN_CO_50_59
           , POPLTN_CO_60_69       , POPLTN_CO_70_79, POPLTN_CO_80_89, POPLTN_CO_90_99, POPLTN_CO_100
           , LVL1, LVL2
        FROM TB_POPLTN_DATA,
		(SELECT (tmp1.idx) AS LVL1 
		FROM (SELECT 1 as idx UNION SELECT 2 UNION SELECT 3) tmp1) LVL1, (SELECT (tmp2.idx) AS LVL2 
		FROM (SELECT 1 as idx UNION SELECT 2 UNION SELECT 3 UNION SELECT 4 UNION SELECT 5 UNION SELECT 6 UNION SELECT 7 UNION SELECT 8 UNION SELECT 9 UNION SELECT 10 UNION SELECT 11) tmp2) LVL2
     ) A ;

-- UNION을 통해 레벨을 설정
-- 1~3 레벨 (LV1) : 성별을 의미 ( 남 / 여 / 전체)
-- 1~11 레벨 (LV2) : 나이대를 의미 

-- 카르테시안 JOIN
-- melting을 시켜야할 때 크로스조인이 필요하다.

UNION

  • UNION은 여러 개의 SELECT 결과를 “세로”로 집합하는 연산자

    1
    2
    3
    
      (SELECT (tmp1.idx) AS LVL1 
          FROM (SELECT 1 as idx UNION SELECT 2 UNION SELECT 3) tmp1) LVL1, (SELECT (tmp2.idx) AS LVL2 
          FROM (SELECT 1 as idx UNION SELECT 2 UNION SELECT 3 UNION SELECT 4 UNION SELECT 5 UNION SELECT 6 UNION SELECT 7 UNION SELECT 8 UNION SELECT 9 UNION SELECT 10 UNION SELECT 11) tmp2) LVL2
    

    위 코드 해석

gpt는 멋쟁이 선생님이에오,,,

1
2
3
SELECT * FROM TB_POPLTN;

SELECT * FROM TB_POPLTN WHERE ADMINIST_ZONE_NM LIKE '대구%'
1
2
3
4
5
6
7
8
9
10
11
-- 전체 인구의 연령대별 비율 구하기
-- 2023년 4월 기준 전국 시/도 기준 연령대별 인구수 합계
SELECT 
	AGRDE_SE_CD,
	SUM(POPLTN_CNT) AS AGRDE_POPLTN_CNT
FROM TB_POPLTN
WHERE STD_MT='202304'
	AND ADMINIST_ZONE_NO LIKE '__00000000'
	AND POPLTN_SE_CD = 'T'
GROUP BY AGRDE_SE_CD
ORDER BY AGRDE_SE_CD;
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
-- 2023년 4월 기준 전국 주요 시/도의 연령대별 인구수를 구하고, 연령대별 인구 비율 

SELECT 
	A.AGRDE_SE_CD,
	A.AGRDE_POPLTN_CNT,
	A.AGRDE_POPLTN_CNT / SUM(A.AGRDE_SE_CD) OVER() AS AGRDE_POPLTN_RATE
FROM (
	SELECT 
		AGRDE_SE_CD,
		SUM(POPLTN_CNT) AS AGRDE_POPLTN_CNT
	FROM TB_POPLTN
	WHERE STD_MT='202304'
		AND ADMINIST_ZONE_NO LIKE '__00000000'
		AND POPLTN_SE_CD = 'T'
	GROUP BY AGRDE_SE_CD
	ORDER BY AGRDE_SE_CD) A;
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
-- 서브 쿼리 대신 WITH ~ AS 구문 사용하기
-- WITH AS 구문 : 서브쿼리에서 조회할 결과를 미리 임시 테이블 형식으로 만들어 놓는 방법,
-- 				임시 테이블이기 때문에 1회성, 한번 사용하면 사라진다. VIEW와 다름 !!! 
-- 반드시!!! SELECT 절과 함께~~~
WITH TEMP_AGRDE_POPLTN_CNT AS(
	SELECT 
		AGRDE_SE_CD,
		SUM(POPLTN_CNT) AS AGRDE_POPLTN_CNT
	FROM TB_POPLTN
	WHERE STD_MT='202304'
		AND ADMINIST_ZONE_NO LIKE '__00000000'
		AND POPLTN_SE_CD = 'T'
	GROUP BY AGRDE_SE_CD
	ORDER BY AGRDE_SE_CD
)
SELECT 
	AGRDE_SE_CD,
	AGRDE_POPLTN_CNT,
	AGRDE_POPLTN_CNT / SUM(AGRDE_POPLTN_CNT) OVER () AGRDE_POPLTN_RATE
FROM TEMP_AGRDE_POPLTN_CNT;
1
2
3
4
5
6
7
8
9
-- 2023년 4월 기준 전국 주요 시/도의 성별(M, F) 인구수 합계
SELECT
	POPLTN_SE_CD,
	SUM(POPLTN_CNT) AS SE_POPLTN_CNT
FROM TB_POPLTN
WHERE STD_MT = '202304'
	AND ADMINIST_ZONE_NO LIKE '__00000000'
	AND POPLTN_SE_CD IN ('M', 'F')
GROUP BY POPLTN_SE_CD;
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
-- 남성 / 여성 비율 구하기
-- 남자 : 여자 비율 구하기
-- 남자 / 전체
-- 여자 / 전체 

-- colum 레벨에서 성별을 선택할 수 있게 만들기 위해 
-- CASE WHEN THEN을 이용해서 Pivoting을 수행한다.
-- 왜? 남자, 여자 정보를 SELECT 해야 하기 때문!! 

SELECT
	POPLTN_SE_CD,
	CASE 
		WHEN POPLTN_SE_CD = 'M' THEN SUM(POPLTN_CNT)
		ELSE 0
	END AS MALE_POPLTN_CNT,
	CASE 
		WHEN POPLTN_SE_CD = 'F' THEN SUM(POPLTN_CNT)
		ELSE 0
	END AS FEMALE_POPLTN_CNT
FROM TB_POPLTN
WHERE STD_MT = '202304'
	AND ADMINIST_ZONE_NO LIKE '__00000000'
	AND POPLTN_SE_CD IN ('M', 'F')
GROUP BY POPLTN_SE_CD; 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
-- 표에 더이상 필요없는 F,M 없애기
WITH TEMP_SE_POPLTN_CNT AS (
	SELECT
	POPLTN_SE_CD,
	CASE WHEN POPLTN_SE_CD = 'M' THEN SUM(POPLTN_CNT) ELSE 0 END AS MALE_POPLTN_CNT,
	CASE WHEN POPLTN_SE_CD = 'F' THEN SUM(POPLTN_CNT) ELSE 0 END AS FEMALE_POPLTN_CNT
FROM TB_POPLTN
WHERE STD_MT = '202304'
	AND ADMINIST_ZONE_NO LIKE '__00000000'
	AND POPLTN_SE_CD IN ('M', 'F')
GROUP BY POPLTN_SE_CD 
), TEMP_PIVOT_POPLTN_CNT AS (
	SELECT
		MAX(MALE_POPLTN_CNT) AS MALE_POPLTN_CNT,
		MAX(FEMALE_POPLTN_CNT) AS FEMALE_POPLTN_CNT
	FROM TEMP_SE_POPLTN_CNT
)SELECT 
	MALE_POPLTN_CNT, FEMALE_POPLTN_CNT,
	MALE_POPLTN_CNT / FEMALE_POPLTN_CNT AS "남성/여성 비율",
	MALE_POPLTN_CNT / (MALE_POPLTN_CNT + FEMALE_POPLTN_CNT ) AS "전체인구수대비남성비율",
	FEMALE_POPLTN_CNT / (MALE_POPLTN_CNT + FEMALE_POPLTN_CNT ) AS "전체인구수대비여성비율"
FROM TEMP_PIVOT_POPLTN_CNT;
-- -- -- -- -- -- -- --
-- -- 25-02-06-THU -- --

-- 연령대 별 인구가 가장 많은 지역 구하기
-- 2023년 4월 기준 전국의 읍/면/동별 전체 인구수 조회

USE analyze_db; 

SELECT
    AGRDE_SE_CD,
    ADMINIST_ZONE_NO,
    ADMINIST_ZONE_NM,
    POPLTN_CNT
FROM TB_POPLTN
WHERE POPLTN_SE_CD = 'T'
  AND STD_MT = '202304'
  AND POPLTN_CNT > 0
  AND ADMINIST_ZONE_NO NOT LIKE '_____00000'
ORDER BY POPLTN_CNT DESC
-- 내림차순으로 rank 먹여서 1등인 곳이 어쨌든 최대값이겠지? 
-- 집계 대상 값!이 중요하면 GROUP BY를 사용하는 것이고,
-- 우리가 보고자 하는 건 가장 많은 동네!!가 중요한 거기 때문에
-- window function 을 쓰는 게 더 낫다. 
WITH TEMP_EMD_POPLTN_CNT AS(
    SELECT
        AGRDE_SE_CD, -- 연령대별
        ADMINIST_ZONE_NO, -- 시군구 번호
        ADMINIST_ZONE_NM, -- 시군구 이름
        POPLTN_CNT -- 인구 총합
    FROM TB_POPLTN
    WHERE POPLTN_SE_CD = 'T'
      AND STD_MT = '202304'
      AND POPLTN_CNT > 0
      AND ADMINIST_ZONE_NO NOT LIKE '_____00000'
    ORDER BY POPLTN_CNT DESC
), TEMP_AGRDE_POPLTN_RANK AS(
    SELECT
        AGRDE_SE_CD,
        ADMINIST_ZONE_NO,
        ADMINIST_ZONE_NM,
        POPLTN_CNT,
        RANK() OVER(PARTITION BY AGRDE_SE_CD ORDER BY POPLTN_CNT DESC) AS POPLTN_RANK
    FROM TEMP_EMD_POPLTN_CNT
)
SELECT *
FROM TEMP_AGRDE_POPLTN_RANK
WHERE POPLTN_RANK = 1
ORDER BY AGRDE_SE_CD;

-- #############################
-- 연령대 별 인구 비율이 가장 높은 지역 찾기
--  동네 별 연령대 비율 구하기 -> 그 비율이 연령대 별로 1위인 지역만 구하기 

WITH TEMP_EMD_POPLTN_CNT AS(
	SELECT
	    AGRDE_SE_CD,
	    ADMINIST_ZONE_NO,
	    ADMINIST_ZONE_NM,
	    POPLTN_CNT
	FROM TB_POPLTN
	WHERE POPLTN_SE_CD = 'T'
	  AND STD_MT = '202304'
	  AND POPLTN_CNT > 0
	  AND ADMINIST_ZONE_NO NOT LIKE '_____00000'
	ORDER BY POPLTN_CNT DESC
), TEMP_AGRDE_RATE AS(
	SELECT
		*,
		POPLTN_CNT / SUM(POPLTN_CNT) OVER(PARTITION BY ADMINIST_ZONE_NO) AS "지역별연령대비율"
	FROM TEMP_EMD_POPLTN_CNT
), TEMP_AGRDE_RATE_RANK AS(
	SELECT
		*,
		RANK() OVER(PARTITION BY AGRDE_SE_CD ORDER BY 지역별연령대비율 DESC) AS POPLTN_RATE_RANK
	FROM TEMP_AGRDE_RATE
)
SELECT *
FROM TEMP_AGRDE_RATE_RANK 
WHERE POPLTN_RATE_RANK = 1
ORDER BY AGRDE_SE_CD;

이해 해보자

-- group by -> 숫자가 중요, max, min
-- rank -> 숫자보단 주변 정보(동네이름)

-- #####################################################
-- ############# 이정도 풀줄 알면 충분함....... ###############
-- #####################################################

-- 숙제 1) 2023년 4월 전국의 각 읍/면/동 기준 남성의 수보다 여성의 수가 많은 지역 기준 내림차순

SHOW tables

WITH
    TEMP_SE_REDUCE AS (
        SELECT ADMINIST_ZONE_NO,
               ADMINIST_ZONE_NM,
               POPLTN_SE_CD,
               SUM(POPLTN_CNT) AS POPLTN_CNT
        FROM TB_POPLTN
        WHERE ADMINIST_ZONE_NO NOT LIKE '_____00000'
          AND POPLTN_SE_CD IN ('M', 'F')
          AND STD_MT = '202304'
        GROUP BY ADMINIST_ZONE_NO, ADMINIST_ZONE_NM, POPLTN_SE_CD
    ),
    TEMP_SE_PIVOT AS(
        SELECT
            ADMINIST_ZONE_NO,
            ADMINIST_ZONE_NM,
            IF(POPLTN_SE_CD = 'M', POPLTN_CNT, 0) AS MALE_POPLTN_CNT,
            IF(POPLTN_SE_CD = 'F', POPLTN_CNT, 0) AS FEMALE_POPLTN_CNT
        FROM TEMP_SE_REDUCE
    ),
    TEMP_ADMINIST_PIVOT AS (
        SELECT
            ADMINIST_ZONE_NO, ADMINIST_ZONE_NM,
            MAX(MALE_POPLTN_CNT) AS MALE_POPLTN_CNT,
            MAX(FEMALE_POPLTN_CNT) AS FEMALE_POPLTN_CNT
        FROM TEMP_SE_PIVOT
        GROUP BY ADMINIST_ZONE_NO, ADMINIST_ZONE_NM
    )
SELECT
    ADMINIST_ZONE_NO,
    ADMINIST_ZONE_NM,
    FEMALE_POPLTN_CNT,
    MALE_POPLTN_CNT,
    FEMALE_POPLTN_CNT - MALE_POPLTN_CNT AS FEMALE_MALE_DIFF
FROM TEMP_ADMINIST_PIVOT
ORDER BY FEMALE_MALE_DIFF DESC
LIMIT 10;
-- 숙제 2)
-- 남성/여성 비율이 가장 높은 지역과 가장 낮은 지역 구하기
--  전국의 각 읍/면/동 기준 남성 비율 및 여성 비율이 가장 높거나 낮은 지역 구하기.
--   비율 : A라는 동네에 남성이 100명, 여성이 130명이면 남성 비율 : 100/(100+130) 여성 비율 : 130 / (100+130)

WITH TEMP_POPLTN_CNT AS (
    SELECT
        A.ADMINIST_ZONE_NO,
        A.ADMINIST_ZONE_NM,
        A.POPLTN_SE_CD,
        SUM(A.POPLTN_CNT) AS POPLTN_CNT
    FROM TB_POPLTN A
    WHERE A.ADMINIST_ZONE_NO NOT LIKE '_____00000'
      AND A.POPLTN_SE_CD IN ('M', 'F')
      AND A.STD_MT = '202304'
      AND A.POPLTN_CNT > 0
    GROUP BY A.ADMINIST_ZONE_NO, A.ADMINIST_ZONE_NM, A.POPLTN_SE_CD
    ORDER BY A.ADMINIST_ZONE_NO, A.POPLTN_SE_CD
), TEMP_SE_CNT_PIVOT AS (
    SELECT ADMINIST_ZONE_NO,
           ADMINIST_ZONE_NM,
           CASE WHEN POPLTN_SE_CD = 'M' THEN POPLTN_CNT ELSE 0 END AS MALE_POPLTN_CNT,
           CASE WHEN POPLTN_SE_CD = 'F' THEN POPLTN_CNT ELSE 0 END AS FEMALE_POPLTN_CNT
    FROM TEMP_POPLTN_CNT
), TEMP_SE_CNT_PIVOT_REDUCE AS (
    SELECT ADMINIST_ZONE_NO,
           ADMINIST_ZONE_NM,
           MAX(MALE_POPLTN_CNT) AS MALE_POPLTN_CNT,
           MAX(FEMALE_POPLTN_CNT) AS FEMALE_POPLTN_CNT,
           MAX(MALE_POPLTN_CNT) + MAX(FEMALE_POPLTN_CNT) AS TOT_POPLTN_CNT
    FROM TEMP_SE_CNT_PIVOT
    GROUP BY ADMINIST_ZONE_NO, ADMINIST_ZONE_NM
), TEMP_SE_CNT_RATE AS (
    SELECT
        ADMINIST_ZONE_NO,
        ADMINIST_ZONE_NM,
        MALE_POPLTN_CNT,
        FEMALE_POPLTN_CNT,
        MALE_POPLTN_CNT / TOT_POPLTN_CNT AS "남성인구비율",
        FEMALE_POPLTN_CNT / TOT_POPLTN_CNT AS "여성인구비율",
        RANK() OVER( ORDER BY MALE_POPLTN_CNT / TOT_POPLTN_CNT ) AS MALE_RATE_ASC, -- 남성 인구 비율 오름차순 랭킹
        RANK() OVER( ORDER BY FEMALE_POPLTN_CNT / TOT_POPLTN_CNT ) AS FEMALE_RATE_ASC, -- 여성 인구 비율 오름차순 랭킹
        RANK() OVER( ORDER BY MALE_POPLTN_CNT / TOT_POPLTN_CNT DESC) AS MALE_RATE_DESC, -- 남성 인구 비율 내림차순 랭킹
        RANK() OVER( ORDER BY FEMALE_POPLTN_CNT / TOT_POPLTN_CNT DESC) AS FEMALE_RATE_DESC -- 여성 인구 비율 내림차순 랭킹
    FROM TEMP_SE_CNT_PIVOT_REDUCE
)
SELECT
    ADMINIST_ZONE_NO,
    ADMINIST_ZONE_NM,
    MALE_POPLTN_CNT,
    FEMALE_POPLTN_CNT,
    남성인구비율,
    여성인구비율
FROM TEMP_SE_CNT_RATE
WHERE MALE_RATE_ASC = 1 OR FEMALE_RATE_ASC = 1 OR MALE_RATE_DESC = 1 OR FEMALE_RATE_DESC = 1;

📌✅1️⃣🎯

This post is licensed under CC BY 4.0 by the author.