6 Wed
TIL
[프로그래머스 AI 스쿨 1기] EDA Project
Jupyter Notebook으로 csv 분석
kaggle에서 찾은 korean health로 데이터 주제를 선택했다. 흡연이나 음주, 가족력등으로 질병을 파악하는 데이터셋으로 보여서 상관관계가 있을 것으로 예상했다.
https://www.kaggle.com/junsoopablo/korean-genome-and-epidemiology-study-koges
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import os, glob
mpl.rcParams['axes.unicode_minus'] = False
healths = [pd.read_csv(os.path.join("archive", "follow_0"+str(i)+"_data.csv")) for i in range(1, 6)]
h = pd.DataFrame()
for name in healths[0].columns:
h[name[4:]] = healths[0][name]
for idx in range(1, 5):
other = pd.DataFrame()
for name in healths[idx].columns:
other[name[4:]] = healths[idx][name]
h = pd.concat([h, other], axis=0, ignore_index=True)
h
ID
DATA_CLASS
EDATE
SEX
AGE
EDU
MARRY
DRINK
DRDU
TAKFQ
...
DBP
HBA1C
GLU0
CREATININE
AST
ALT
TCHL
HDL
TG
INS0
0
K_FOLLOW_0001
F05
200412
1.0
56
1.0
2
3
4.0
0
...
80
5.6
82
0.7
30
38
154
35
126
6.7
1
K_FOLLOW_0002
F19
200401
1.0
40
3.0
2
3
4.0
0
...
118
5.5
130
1.1
46
75
214
44
169
4.3
2
K_FOLLOW_0003
F05
200309
1.0
52
2.0
2
2
3.0
0
...
90
5.2
83
0.9
29
45
130
27
134
7.9
3
K_FOLLOW_0004
F05
200504
2.0
60
2.0
2
1
77777.0
0
...
90
5.6
89
0.6
35
34
182
47
123
12.6
4
K_FOLLOW_0005
F19
200402
1.0
49
3.0
2
3
4.0
0
...
79
6.9
95
1.0
52
33
203
36
277
2.7
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
4530
K_FOLLOW_0993
F05
201505
NaN
74
NaN
3
2
NaN
0
...
63
6.2
111
0.7
25
22
173
40
79
29.1
4531
K_FOLLOW_0995
F19
201412
NaN
55
NaN
2
3
NaN
0
...
69
5.3
94
1.0
27
23
221
63
134
5.8
4532
K_FOLLOW_0996
F05
201605
NaN
75
NaN
2
1
NaN
0
...
76
5.7
111
1.5
26
18
199
34
129
10.9
4533
K_FOLLOW_0998
F19
201410
NaN
52
NaN
2
3
NaN
0
...
76
4.9
93
0.8
22
17
269
41
141
11.8
4534
K_FOLLOW_1000
F19
201410
NaN
56
NaN
2
1
NaN
0
...
71
6.0
84
0.8
22
28
246
37
282
17.0
4535 rows × 64 columns
# HTN : Hypertension, 고혈압
# DM : diabetes, 당뇨
# LIP : Hyperlipidemia, 고지질혈증
h.columns
Index(['ID', 'DATA_CLASS', 'EDATE', 'SEX', 'AGE', 'EDU', 'MARRY', 'DRINK',
'DRDU', 'TAKFQ', 'TAKAM', 'RICEFQ', 'RICEAM', 'WINEFQ', 'WINEAM',
'SOJUFQ', 'SOJUAM', 'BEERFQ', 'BEERAM', 'HLIQFQ', 'HLIQAM', 'SMOKE',
'SMAG', 'SMDU', 'SMAM', 'PSM', 'EXER', 'HTN', 'HTNAG', 'DM', 'DMAG',
'LIP', 'LIPAG', 'FMFHT', 'FMFHTAG', 'FMMHT', 'FMMHTAG', 'FMFDM',
'FMFDMAG', 'FMMDM', 'FMMDMAG', 'MNSAG', 'PREG', 'FPREGAG', 'CHILD',
'FLABAG', 'PMYN_C', 'PMAG_C', 'HEIGHT', 'WEIGHT', 'WAIST', 'HIP',
'PULSE', 'SBP', 'DBP', 'HBA1C', 'GLU0', 'CREATININE', 'AST', 'ALT',
'TCHL', 'HDL', 'TG', 'INS0'],
dtype='object')
concern_column=["ID", "DATA_CLASS", 'EDATE', 'SEX', 'AGE', 'EDU' ,'MARRY', 'DRINK', 'DRDU', 'SMOKE', 'SMDU', 'EXER', 'HTN', 'DM', 'LIP',
'WEIGHT', 'HEIGHT']
df = h[concern_column]
df
ID
DATA_CLASS
EDATE
SEX
AGE
EDU
MARRY
DRINK
DRDU
SMOKE
SMDU
EXER
HTN
DM
LIP
WEIGHT
HEIGHT
0
K_FOLLOW_0001
F05
200412
1.0
56
1.0
2
3
4.0
3
40.0
1
1
1
1
50
159
1
K_FOLLOW_0002
F19
200401
1.0
40
3.0
2
3
4.0
2
20.0
1
1
1
1
94
169
2
K_FOLLOW_0003
F05
200309
1.0
52
2.0
2
2
3.0
2
15.0
1
2
1
1
63
165
3
K_FOLLOW_0004
F05
200504
2.0
60
2.0
2
1
77777.0
1
77777.0
1
1
1
1
70
165
4
K_FOLLOW_0005
F19
200402
1.0
49
3.0
2
3
4.0
3
20.0
1
1
1
1
69
166
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
4530
K_FOLLOW_0993
F05
201505
NaN
74
NaN
3
2
NaN
2
15.0
1
1
1
1
53
154
4531
K_FOLLOW_0995
F19
201412
NaN
55
NaN
2
3
NaN
2
17.0
2
1
1
1
67
169
4532
K_FOLLOW_0996
F05
201605
NaN
75
NaN
2
1
NaN
3
52.0
1
1
1
1
57
161
4533
K_FOLLOW_0998
F19
201410
NaN
52
NaN
2
3
NaN
3
28.0
1
1
1
1
76
178
4534
K_FOLLOW_1000
F19
201410
NaN
56
NaN
2
1
NaN
1
77777.0
1
1
1
1
72
153
4535 rows × 17 columns
df.describe()
EDATE
SEX
AGE
EDU
MARRY
DRINK
DRDU
SMOKE
SMDU
EXER
HTN
DM
LIP
WEIGHT
HEIGHT
count
4535.000000
1000.000000
4535.000000
1000.000000
4535.000000
4535.000000
1000.000000
4535.000000
4535.000000
4535.000000
4535.000000
4535.000000
4535.000000
4535.000000
4535.000000
mean
200871.500331
1.558000
58.764057
702.283000
5272.043219
332.727233
41478.984000
332.229327
54322.767343
199.826461
155.441235
155.384344
155.389636
216.592282
313.328776
std
387.311708
0.496873
9.670914
8341.161821
18212.583546
5742.123427
39837.974579
5742.152079
36072.751235
4450.807540
3926.121652
3926.123883
3926.123675
3923.730540
3919.923804
min
200301.000000
1.000000
40.000000
1.000000
1.000000
1.000000
1.000000
1.000000
0.100000
1.000000
1.000000
1.000000
1.000000
36.000000
132.000000
25%
200509.000000
1.000000
51.000000
1.000000
2.000000
1.000000
4.000000
1.000000
36.000000
1.000000
1.000000
1.000000
1.000000
55.000000
152.000000
50%
200811.000000
2.000000
58.000000
2.000000
2.000000
2.000000
77777.000000
1.000000
77777.000000
1.000000
1.000000
1.000000
1.000000
61.000000
159.000000
75%
201207.000000
2.000000
66.000000
3.000000
2.000000
3.000000
77777.000000
2.000000
77777.000000
2.000000
1.000000
1.000000
1.000000
69.000000
166.000000
max
201702.000000
2.000000
86.000000
99999.000000
99999.000000
99999.000000
99999.000000
99999.000000
99999.000000
99999.000000
99999.000000
99999.000000
99999.000000
99999.000000
99999.000000
update_lists = ['EDU', 'MARRY', 'DRINK', 'DRDU', 'SMOKE', 'SMDU', 'EXER', 'HTN', 'DM', 'LIP', 'WEIGHT', 'HEIGHT']
for u_list in update_lists:
df[u_list] = df[u_list].replace(99999.0, df[u_list].quantile(0.25))
df[u_list] = df[u_list].replace(77777.0, df[u_list].quantile(0.30))
df[u_list] = df[u_list].replace(66666.0, df[u_list].quantile(0.35))
df.describe()
EDATE
SEX
AGE
EDU
MARRY
DRINK
DRDU
SMOKE
SMDU
EXER
HTN
DM
LIP
WEIGHT
HEIGHT
count
4535.000000
1000.000000
4535.000000
1000.000000
4535.000000
4535.000000
1000.000000
4535.000000
4535.000000
4535.000000
4535.000000
4535.000000
4535.000000
4535.000000
4535.000000
mean
200871.500331
1.558000
58.764057
2.297000
2.130320
1.973098
3.713000
1.475193
40.015854
1.373980
1.089305
1.032415
1.037707
62.323705
159.209923
std
387.311708
0.496873
9.670914
1.294011
0.454352
0.980705
0.723294
0.755944
12.677512
0.483912
0.285216
0.177118
0.190507
10.306152
9.177130
min
200301.000000
1.000000
40.000000
1.000000
1.000000
1.000000
1.000000
1.000000
0.100000
1.000000
1.000000
1.000000
1.000000
36.000000
132.000000
25%
200509.000000
1.000000
51.000000
1.000000
2.000000
1.000000
4.000000
1.000000
36.000000
1.000000
1.000000
1.000000
1.000000
55.000000
152.000000
50%
200811.000000
2.000000
58.000000
2.000000
2.000000
2.000000
4.000000
1.000000
47.000000
1.000000
1.000000
1.000000
1.000000
61.000000
159.000000
75%
201207.000000
2.000000
66.000000
3.000000
2.000000
3.000000
4.000000
2.000000
47.000000
2.000000
1.000000
1.000000
1.000000
69.000000
166.000000
max
201702.000000
2.000000
86.000000
6.000000
6.000000
3.000000
4.000000
3.000000
61.000000
2.000000
2.000000
2.000000
2.000000
110.000000
186.000000
df['BMI'] = df['WEIGHT'] * 10000 / (df['HEIGHT'] ** 2)
df.describe()
c:\users\32154049\appdata\local\programs\python\python37\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""Entry point for launching an IPython kernel.
EDATE
SEX
AGE
EDU
MARRY
DRINK
DRDU
SMOKE
SMDU
EXER
HTN
DM
LIP
WEIGHT
HEIGHT
BMI
count
4535.000000
1000.000000
4535.000000
1000.000000
4535.000000
4535.000000
1000.000000
4535.000000
4535.000000
4535.000000
4535.000000
4535.000000
4535.000000
4535.000000
4535.000000
4535.000000
mean
200871.500331
1.558000
58.764057
2.297000
2.130320
1.973098
3.713000
1.475193
40.015854
1.373980
1.089305
1.032415
1.037707
62.323705
159.209923
24.520747
std
387.311708
0.496873
9.670914
1.294011
0.454352
0.980705
0.723294
0.755944
12.677512
0.483912
0.285216
0.177118
0.190507
10.306152
9.177130
3.025413
min
200301.000000
1.000000
40.000000
1.000000
1.000000
1.000000
1.000000
1.000000
0.100000
1.000000
1.000000
1.000000
1.000000
36.000000
132.000000
16.649324
25%
200509.000000
1.000000
51.000000
1.000000
2.000000
1.000000
4.000000
1.000000
36.000000
1.000000
1.000000
1.000000
1.000000
55.000000
152.000000
22.460034
50%
200811.000000
2.000000
58.000000
2.000000
2.000000
2.000000
4.000000
1.000000
47.000000
1.000000
1.000000
1.000000
1.000000
61.000000
159.000000
24.341758
75%
201207.000000
2.000000
66.000000
3.000000
2.000000
3.000000
4.000000
2.000000
47.000000
2.000000
1.000000
1.000000
1.000000
69.000000
166.000000
26.374068
max
201702.000000
2.000000
86.000000
6.000000
6.000000
3.000000
4.000000
3.000000
61.000000
2.000000
2.000000
2.000000
2.000000
110.000000
186.000000
40.898275
sex = df['SEX'].value_counts()
sex
2.0 558
1.0 442
Name: SEX, dtype: int64
sns.countplot(x='SEX', data=df, palette="Set3").set_xticklabels(['Male', 'Female'])
[Text(0,0,'Male'), Text(0,0,'Female')]

age = (df['AGE'] // 10).value_counts().sort_index()
age
4 916
5 1500
6 1420
7 665
8 34
Name: AGE, dtype: int64
age.plot.pie(labels=['40s', '50s', '60s', '70s', '80s'], figsize=(6,6), autopct='%2.2f%%')
<matplotlib.axes._subplots.AxesSubplot at 0x26e3b729248>

df.corr()
EDATE
SEX
AGE
EDU
MARRY
DRINK
DRDU
SMOKE
SMDU
EXER
HTN
DM
LIP
WEIGHT
HEIGHT
BMI
EDATE
1.000000
0.008615
0.366888
-0.031471
0.045600
-0.020274
0.117783
-0.044139
0.061567
0.078942
-0.169604
-0.089983
-0.011097
-0.028595
-0.019003
-0.018632
SEX
0.008615
1.000000
0.095919
-0.402801
0.105127
-0.508050
-0.021876
-0.711529
0.625547
-0.024109
0.047184
-0.024149
-0.027353
-0.481835
-0.754677
0.056700
AGE
0.366888
0.095919
1.000000
-0.413910
0.179970
-0.201568
0.209637
-0.135777
0.220699
-0.086049
0.043383
0.041549
0.035596
-0.218719
-0.310716
0.004590
EDU
-0.031471
-0.402801
-0.413910
1.000000
-0.035169
0.265857
-0.071401
0.269781
-0.358991
0.185560
-0.073431
-0.014515
0.036760
0.277198
0.466428
-0.066283
MARRY
0.045600
0.105127
0.179970
-0.035169
1.000000
-0.044103
-0.000401
-0.065396
0.056706
-0.010054
-0.004731
-0.000430
0.014563
-0.099021
-0.152237
0.009060
DRINK
-0.020274
-0.508050
-0.201568
0.265857
-0.044103
1.000000
-0.362018
0.378713
-0.329277
0.061637
-0.019007
-0.010216
-0.031165
0.299947
0.425832
-0.001447
DRDU
0.117783
-0.021876
0.209637
-0.071401
-0.000401
-0.362018
1.000000
0.028359
0.038567
-0.002015
0.117477
0.081809
0.055598
-0.028925
-0.034134
-0.005400
SMOKE
-0.044139
-0.711529
-0.135777
0.269781
-0.065396
0.378713
0.028359
1.000000
-0.701374
-0.024676
-0.009670
0.018362
-0.034088
0.294119
0.502751
-0.079150
SMDU
0.061567
0.625547
0.220699
-0.358991
0.056706
-0.329277
0.038567
-0.701374
1.000000
-0.053050
0.022635
-0.002213
0.020400
-0.331867
-0.464291
0.001259
EXER
0.078942
-0.024109
-0.086049
0.185560
-0.010054
0.061637
-0.002015
-0.024676
-0.053050
1.000000
-0.045482
0.020650
0.026435
0.153191
0.151872
0.059174
HTN
-0.169604
0.047184
0.043383
-0.073431
-0.004731
-0.019007
0.117477
-0.009670
0.022635
-0.045482
1.000000
0.134787
0.088200
0.060619
-0.038763
0.119192
DM
-0.089983
-0.024149
0.041549
-0.014515
-0.000430
-0.010216
0.081809
0.018362
-0.002213
0.020650
0.134787
1.000000
0.087962
0.065900
0.003683
0.083178
LIP
-0.011097
-0.027353
0.035596
0.036760
0.014563
-0.031165
0.055598
-0.034088
0.020400
0.026435
0.088200
0.087962
1.000000
-0.002062
-0.039725
0.033544
WEIGHT
-0.028595
-0.481835
-0.218719
0.277198
-0.099021
0.299947
-0.028925
0.294119
-0.331867
0.153191
0.060619
0.065900
-0.002062
1.000000
0.671009
0.705170
HEIGHT
-0.019003
-0.754677
-0.310716
0.466428
-0.152237
0.425832
-0.034134
0.502751
-0.464291
0.151872
-0.038763
0.003683
-0.039725
0.671009
1.000000
-0.045135
BMI
-0.018632
0.056700
0.004590
-0.066283
0.009060
-0.001447
-0.005400
-0.079150
0.001259
0.059174
0.119192
0.083178
0.033544
0.705170
-0.045135
1.000000
sns.heatmap(df.corr())
<matplotlib.axes._subplots.AxesSubplot at 0x26e3b76ffc8>

fig, ax = plt.subplots(1, 1, figsize=(10, 5)) # 가로 1개, 세로 1개, figsize
sns.kdeplot(x=df['EDU'], ax=ax)
sns.kdeplot(x=df[df.HTN == 2]['EDU'], ax=ax)
sns.kdeplot(x=df[df.DM == 2]['EDU'], ax=ax)
sns.kdeplot(x=df[df.LIP == 2]['EDU'], ax=ax)
plt.legend(['EDU', 'HTN', 'DM', 'LIP'])
plt.show()

fig, ax = plt.subplots(1, 1, figsize=(10, 5)) # 가로 1개, 세로 1개, figsize
sns.kdeplot(x=df['SMOKE'], ax=ax)
sns.kdeplot(x=df[df.HTN == 2]['SMOKE'], ax=ax)
sns.kdeplot(x=df[df.DM == 2]['SMOKE'], ax=ax)
sns.kdeplot(x=df[df.LIP == 2]['SMOKE'], ax=ax)
plt.legend(['SMOKE', 'HTN', 'DM', 'LIP'])
plt.show()

fig, ax = plt.subplots(1, 1, figsize=(10, 5)) # 가로 1개, 세로 1개, figsize
sns.kdeplot(x=df['DRINK'], ax=ax)
sns.kdeplot(x=df[df.HTN == 2]['DRINK'], ax=ax)
sns.kdeplot(x=df[df.DM == 2]['DRINK'], ax=ax)
sns.kdeplot(x=df[df.LIP == 2]['DRINK'], ax=ax)
plt.legend(['DRINK', 'HTN', 'DM', 'LIP'])
plt.show()

fig, ax = plt.subplots(1, 1, figsize=(10, 5)) # 가로 1개, 세로 1개, figsize
sns.kdeplot(x=df[df.HTN == 2]['AGE'], ax=ax)
sns.kdeplot(x=df[df.DM == 2]['AGE'], ax=ax)
sns.kdeplot(x=df[df.LIP == 2]['AGE'], ax=ax)
plt.legend(['HTN', 'DM', 'LIP'])
plt.show()

fig, ax = plt.subplots(1, 1, figsize=(10, 5)) # 가로 1개, 세로 1개, figsize
sns.kdeplot(x=df[df.SEX == 1]['DRINK'], ax=ax)
sns.kdeplot(x=df[df.SEX == 2]['DRINK'], ax=ax)
plt.legend(['Male', 'Female'])
plt.show()

fig, ax = plt.subplots(1, 1, figsize=(10, 5)) # 가로 1개, 세로 1개, figsize
sns.kdeplot(x=df[df.SEX == 1]['SMOKE'], ax=ax)
sns.kdeplot(x=df[df.SEX == 2]['SMOKE'], ax=ax)
plt.legend(['Male', 'Female'])
plt.show()

sns.catplot(x='EDU', y='DRINK', hue='SEX', kind='point', data=df)
plt.show()

sns.catplot(x='EDU', y='SMOKE', hue='SEX', kind='point', data=df)
plt.show()

s = sns.catplot(x="SEX", y="HEIGHT", data=df, kind='violin').set_xticklabels(['Male', 'Female'])
s.fig.set_size_inches(10, 6)
plt.show()

s = sns.catplot(x="SEX", y="WEIGHT", data=df, kind='violin').set_xticklabels(['Male', 'Female'])
s.fig.set_size_inches(10, 6)
plt.show()

s = sns.catplot(x="SEX", y="BMI", data=df, kind='violin').set_xticklabels(['Male', 'Female'])
s.fig.set_size_inches(10, 6)
plt.show()

sns.heatmap(df[['SMOKE', 'HTN']].groupby(['SMOKE']).mean())
plt.plot()
[]

sns.heatmap(df[['SMOKE', 'DM']].groupby(['SMOKE']).mean())
plt.plot()
[]

sns.heatmap(df[['SMOKE', 'LIP']].groupby(['SMOKE']).mean())
plt.plot()
[]

sns.heatmap(df[['DRINK', 'HTN']].groupby(['DRINK']).mean())
plt.plot()
[]

sns.heatmap(df[['DRINK', 'DM']].groupby(['DRINK']).mean())
plt.plot()
[]

sns.heatmap(df[['DRINK', 'LIP']].groupby(['DRINK']).mean())
plt.plot()
[]

AWS 연동
AWS에는 좋지 않은 감정이 지만 과제 제출을 위해 다시 방문했다. 서버 배포를 위해서 아래 사이트를 참고했다
https://nerogarret.tistory.com/45
1번부터 4번 시리즈 까지만 보면 된다. 근데도 장난 아니다. 이 페이지가 진짜 엄청 자세하고 친절하게 설명했는데도 오류가 나니까, 너무 힘들었다. 배포가 이렇게 어렵다니...
완성된 웹페이지
주소 : http://ec2-54-180-144-82.ap-northeast-2.compute.amazonaws.com/health/
인데, 인스턴스를 닫으면 못 열거같아서 html 파일을 업로드 한다. 부트스트랩은 같이 설정이안돼서 허전할 수 있다
등등등..
Last updated
Was this helpful?