4 Mon
TIL
[์ธํ๋ฐ] ๋จ ๋ ์ฅ์ ๋ฌธ์๋ก ๋ฐ์ดํฐ ๋ถ์๊ณผ ์๊ฐํ ๋ฝ๊ฐ๊ธฐ
df.sort_values, rename, sort_index, reset_index๋ก ๋ฐ์ดํฐ ํ๋ ์ Reshaping ํ๊ธฐ - Reshaping Data
import pandas as pd
import numpy as np
import seaborn as sns # ์๊ฐํ ํด
df = sns.load_dataset("mpg")
df.shape
(398, 9)
df.head()
mpg
cylinders
displacement
horsepower
weight
acceleration
model_year
origin
name
0
18.0
8
307.0
130.0
3504
12.0
70
usa
chevrolet chevelle malibu
1
15.0
8
350.0
165.0
3693
11.5
70
usa
buick skylark 320
2
18.0
8
318.0
150.0
3436
11.0
70
usa
plymouth satellite
3
16.0
8
304.0
150.0
3433
12.0
70
usa
amc rebel sst
4
17.0
8
302.0
140.0
3449
10.5
70
usa
ford torino
df.sort_values('mpg')
Order rows by values of a column (low to high).
df.sort_values('mpg',ascending=False)
Order rows by values of a column (high to low).
df.sort_values('mpg').head()
mpg
cylinders
displacement
horsepower
weight
acceleration
model_year
origin
name
28
9.0
8
304.0
193.0
4732
18.5
70
usa
hi 1200d
25
10.0
8
360.0
215.0
4615
14.0
70
usa
ford f250
26
10.0
8
307.0
200.0
4376
15.0
70
usa
chevy c20
103
11.0
8
400.0
150.0
4997
14.0
73
usa
chevrolet impala
124
11.0
8
350.0
180.0
3664
11.0
73
usa
oldsmobile omega
df.sort_values('mpg', ascending=False).head()
# default๋ True์ด๋ค
mpg
cylinders
displacement
horsepower
weight
acceleration
model_year
origin
name
322
46.6
4
86.0
65.0
2110
17.9
80
japan
mazda glc
329
44.6
4
91.0
67.0
1850
13.8
80
japan
honda civic 1500 gl
325
44.3
4
90.0
48.0
2085
21.7
80
europe
vw rabbit c (diesel)
394
44.0
4
97.0
52.0
2130
24.6
82
europe
vw pickup
326
43.4
4
90.0
48.0
2335
23.7
80
europe
vw dasher (diesel)
df.sort_values?
df.rename(columns = {'y':'year'})
Rename the columns of a DataFrame
df = df.rename(columns = {'model_year' : 'year'})
df.head()
mpg
cylinders
displacement
horsepower
weight
acceleration
year
origin
name
0
18.0
8
307.0
130.0
3504
12.0
70
usa
chevrolet chevelle malibu
1
15.0
8
350.0
165.0
3693
11.5
70
usa
buick skylark 320
2
18.0
8
318.0
150.0
3436
11.0
70
usa
plymouth satellite
3
16.0
8
304.0
150.0
3433
12.0
70
usa
amc rebel sst
4
17.0
8
302.0
140.0
3449
10.5
70
usa
ford torino
df.sort_index()
Sort the index of a DataFrame
df.reset_index()
Reset index of DataFrame to row numbers, moving
index to columns
df.sort_index().head(10)
mpg
cylinders
displacement
horsepower
weight
acceleration
year
origin
name
0
18.0
8
307.0
130.0
3504
12.0
70
usa
chevrolet chevelle malibu
1
15.0
8
350.0
165.0
3693
11.5
70
usa
buick skylark 320
2
18.0
8
318.0
150.0
3436
11.0
70
usa
plymouth satellite
3
16.0
8
304.0
150.0
3433
12.0
70
usa
amc rebel sst
4
17.0
8
302.0
140.0
3449
10.5
70
usa
ford torino
5
15.0
8
429.0
198.0
4341
10.0
70
usa
ford galaxie 500
6
14.0
8
454.0
220.0
4354
9.0
70
usa
chevrolet impala
7
14.0
8
440.0
215.0
4312
8.5
70
usa
plymouth fury iii
8
14.0
8
455.0
225.0
4425
10.0
70
usa
pontiac catalina
9
15.0
8
390.0
190.0
3850
8.5
70
usa
amc ambassador dpl
df.reset_index().head(10)
# index๊ฐ ์์ ๋ index๋ฅผ ์๋ก ์์ฑ
index
mpg
cylinders
displacement
horsepower
weight
acceleration
year
origin
name
0
0
18.0
8
307.0
130.0
3504
12.0
70
usa
chevrolet chevelle malibu
1
1
15.0
8
350.0
165.0
3693
11.5
70
usa
buick skylark 320
2
2
18.0
8
318.0
150.0
3436
11.0
70
usa
plymouth satellite
3
3
16.0
8
304.0
150.0
3433
12.0
70
usa
amc rebel sst
4
4
17.0
8
302.0
140.0
3449
10.5
70
usa
ford torino
5
5
15.0
8
429.0
198.0
4341
10.0
70
usa
ford galaxie 500
6
6
14.0
8
454.0
220.0
4354
9.0
70
usa
chevrolet impala
7
7
14.0
8
440.0
215.0
4312
8.5
70
usa
plymouth fury iii
8
8
14.0
8
455.0
225.0
4425
10.0
70
usa
pontiac catalina
9
9
15.0
8
390.0
190.0
3850
8.5
70
usa
amc ambassador dpl
df.drop(columns=['Length','Height'])
Drop columns from DataFrame
df.drop(columns=['mpg','year']).head(10)
cylinders
displacement
horsepower
weight
acceleration
origin
name
0
8
307.0
130.0
3504
12.0
usa
chevrolet chevelle malibu
1
8
350.0
165.0
3693
11.5
usa
buick skylark 320
2
8
318.0
150.0
3436
11.0
usa
plymouth satellite
3
8
304.0
150.0
3433
12.0
usa
amc rebel sst
4
8
302.0
140.0
3449
10.5
usa
ford torino
5
8
429.0
198.0
4341
10.0
usa
ford galaxie 500
6
8
454.0
220.0
4354
9.0
usa
chevrolet impala
7
8
440.0
215.0
4312
8.5
usa
plymouth fury iii
8
8
455.0
225.0
4425
10.0
usa
pontiac catalina
9
8
390.0
190.0
3850
8.5
usa
amc ambassador dpl
df.drop(columns=['mpg','year', 'name']).head(10)
cylinders
displacement
horsepower
weight
acceleration
origin
0
8
307.0
130.0
3504
12.0
usa
1
8
350.0
165.0
3693
11.5
usa
2
8
318.0
150.0
3436
11.0
usa
3
8
304.0
150.0
3433
12.0
usa
4
8
302.0
140.0
3449
10.5
usa
5
8
429.0
198.0
4341
10.0
usa
6
8
454.0
220.0
4354
9.0
usa
7
8
440.0
215.0
4312
8.5
usa
8
8
455.0
225.0
4425
10.0
usa
9
8
390.0
190.0
3850
8.5
usa
melt, pivot ์ผ๋ก Tidy Data ๋ง๋ค๊ธฐ - Reshaping Data, Method Chaining
pd.melt(df)
Gather columns into rows.
pd.melt?
df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
... 'B': {0: 1, 1: 3, 2: 5},
... 'C': {0: 2, 1: 4, 2: 6}})
df
A
B
C
0
a
1
2
1
b
3
4
2
c
5
6
pd.melt(df, id_vars=['A'], value_vars=['B'])
A
variable
value
0
a
B
1
1
b
B
3
2
c
B
5
pd.melt(df, id_vars=['A'], value_vars=['B', 'C'])
A
variable
value
0
a
B
1
1
b
B
3
2
c
B
5
3
a
C
2
4
b
C
4
5
c
C
6
pd.melt(df, value_vars=['A','B','C'])
variable
value
0
A
a
1
A
b
2
A
c
3
B
1
4
B
3
5
B
5
6
C
2
7
C
4
8
C
6
pd.melt(df, value_vars=['A','B','C']).rename(columns={
'variable' : 'var',
'value' : 'val'})
var
val
0
A
a
1
A
b
2
A
c
3
B
1
4
B
3
5
B
5
6
C
2
7
C
4
8
C
6
df.pivot(columns='var', values='val')
Spread rows into columns.
df.pivot?
df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
... 'two'],
... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
... 'baz': [1, 2, 3, 4, 5, 6],
... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
df
foo
bar
baz
zoo
0
one
A
1
x
1
one
B
2
y
2
one
C
3
z
3
two
A
4
q
4
two
B
5
w
5
two
C
6
t
df2 = df.pivot(index='foo', columns='bar', values='baz')
# row์ ์๋ ๊ฐ๋ค์ด column์ผ๋ก ์ด๋
df2
bar
A
B
C
foo
one
1
2
3
two
4
5
6
df3 = df.pivot(index='foo', columns='bar', values='baz').reset_index()
df3
bar
foo
A
B
C
0
one
1
2
3
1
two
4
5
6
df3.melt(id_vars=['foo'], value_vars=['A','B','C'])
foo
bar
value
0
one
A
1
1
two
A
4
2
one
B
2
3
two
B
5
4
one
C
3
5
two
C
6
df3.melt(id_vars=['foo'], value_vars=['A','B','C']).sort_values('bar')
foo
bar
value
0
one
A
1
1
two
A
4
2
one
B
2
3
two
B
5
4
one
C
3
5
two
C
6
df3.melt(id_vars=['foo'], value_vars=['A','B','C']).sort_values(['foo', 'bar'])
foo
bar
value
0
one
A
1
2
one
B
2
4
one
C
3
1
two
A
4
3
two
B
5
5
two
C
6
df3.melt(id_vars=['foo'], value_vars=['A','B','C']).sort_values(['foo', 'bar']).rename(columns = {'value': 'baz'})
foo
bar
baz
0
one
A
1
2
one
B
2
4
one
C
3
1
two
A
4
3
two
B
5
5
two
C
6
pd.concat([df1,df2]) ์๋ฆฌ์ฆ, ๋ฐ์ดํฐํ๋ ์ ํฉ์น๊ธฐ - Reshaping Data
pd.concat([df1,df2])
Append rows of DataFrames
pd.concat([df1,df2], axis=1)
Append columns of DataFrames
pd.concat?
s1 = pd.Series(['a', 'b'])
s1
0 a
1 b
dtype: object
s2 = pd.Series(['c', 'd'])
s2
0 c
1 d
dtype: object
pd.concat([s1, s2])
0 a
1 b
0 c
1 d
dtype: object
pd.concat([s1, s2], ignore_index=True)
0 a
1 b
2 c
3 d
dtype: object
pd.concat([s1, s2], keys=['s1', 's2'])
s1 0 a
1 b
s2 0 c
1 d
dtype: object
pd.concat([s1, s2], keys=['s1', 's2'],
... names=['Series name', 'Row ID'])
Series name Row ID
s1 0 a
1 b
s2 0 c
1 d
dtype: object
df1 = pd.DataFrame([['a', 1], ['b', 2]],
... columns=['letter', 'number'])
df1
letter
number
0
a
1
1
b
2
df2 = pd.DataFrame([['c', 3], ['d', 4]],
... columns=['letter', 'number'])
df2
letter
number
0
c
3
1
d
4
pd.concat([df1, df2])
letter
number
0
a
1
1
b
2
0
c
3
1
d
4
df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
... columns=['letter', 'number', 'animal'])
df3
letter
number
animal
0
c
3
cat
1
d
4
dog
# ํํ๊ฐ ๋ค๋ฅธ ๋๊ฐ์ ๋ฐ์ดํฐ ํ๋ ์ ํฉ์น๊ธฐ
pd.concat([df1, df3])
letter
number
animal
0
a
1
NaN
1
b
2
NaN
0
c
3
cat
1
d
4
dog
pd.concat([df1, df3], join="inner")
letter
number
0
a
1
1
b
2
0
c
3
1
d
4
df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
... columns=['animal', 'name'])
df4
animal
name
0
bird
polly
1
monkey
george
df5 = pd.DataFrame([1], index=['a'])
df5
0
a
1
df6 = pd.DataFrame([2], index=['a'])
df6
0
a
2
pd.concat([df5, df6])
0
a
1
a
2
pd.concat([df5, df6], verify_integrity=True)
# ์ค๋ฅ ๋ฐ์ : ValueError
merge๋ก ๋ฐ์ดํฐํ๋ ์ ํฉ์น๊ธฐ left, right, inner, outer ์ต์
์ฌ์ฉํ๊ธฐ - Combine Data Sets
import pandas as pd
adf = pd.DataFrame({"x1" : ["A", "B", "C"], "x2" : [1, 2, 3]})
adf
x1
x2
0
A
1
1
B
2
2
C
3
bdf = pd.DataFrame({"x1" : ["A", "B", "D"], "x3" : ["T", "F", "T"]})
bdf
x1
x3
0
A
T
1
B
F
2
D
T
pd.merge(adf, bdf,
how='left', on='x1')
Join matching rows from bdf to adf.
pd.merge(adf, bdf,
how='right', on='x1')
Join matching rows from adf to bdf.
pd.merge(adf, bdf,
how='inner', on='x1')
Join data. Retain only rows in both sets.
pd.merge(adf, bdf,
how='outer', on='x1')
Join data. Retain all values, all rows.
pd.merge(adf, bdf, how='left', on='x1')
x1
x2
x3
0
A
1
T
1
B
2
F
2
C
3
NaN
pd.merge(adf, bdf, how='right', on='x1')
x1
x2
x3
0
A
1.0
T
1
B
2.0
F
2
D
NaN
T
pd.merge(adf, bdf, how='inner', on='x1')
x1
x2
x3
0
A
1
T
1
B
2
F
pd.merge(adf, bdf, how='outer', on='x1')
x1
x2
x3
0
A
1.0
T
1
B
2.0
F
2
C
3.0
NaN
3
D
NaN
T
adf[adf.x1.isin(bdf.x1)]
All rows in adf that have a match in bdf.
adf[~adf.x1.isin(bdf.x1)]
All rows in adf that do not have a match in bdf.
adf.x1.isin(bdf.x1)
0 True
1 True
2 False
Name: x1, dtype: bool
adf[adf.x1.isin(bdf.x1)]
x1
x2
0
A
1
1
B
2
adf[~adf.x1.isin(bdf.x1)]
x1
x2
2
C
3
pd.merge(ydf, zdf)
Rows that appear in both ydf and zdf
(Intersection).
pd.merge(ydf, zdf, how='outer')
Rows that appear in either or both ydf and zdf
(Union).
pd.merge(ydf, zdf, how='outer',
indicator=True)
.query('_merge == "left_only"')
.drop(columns=['_merge'])
Rows that appear in ydf but not zdf (Setdiff).
ydf = pd.DataFrame({"x1" : ["A", "B", "C"], "x2" : [1, 2, 3]})
ydf
x1
x2
0
A
1
1
B
2
2
C
3
zdf = pd.DataFrame({"x1" : ["B", "C", "D"], "x2" : [2, 3, 4]})
zdf
x1
x2
0
B
2
1
C
3
2
D
4
pd.merge(ydf, zdf)
# default : inner join
x1
x2
0
B
2
1
C
3
pd.merge(ydf, zdf, how='outer')
x1
x2
0
A
1
1
B
2
2
C
3
3
D
4
pd.merge(ydf, zdf, how='outer', indicator=True)
x1
x2
_merge
0
A
1
left_only
1
B
2
both
2
C
3
both
3
D
4
right_only
pd.merge(ydf, zdf, how='outer', indicator=True).query('_merge == "left_only"')
x1
x2
_merge
0
A
1
left_only
pd.merge(ydf, zdf, how='outer',indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])
x1
x2
0
A
1
concat은 상하(행 방향)로 합칠 때, merge는 좌우(열 방향, 키 기준 조인)로 합칠 때 사용하면 좋다
ํ์ด์ฌ ํ๋ค์ค๋ก groupby ํ์ฉํ์ฌ ๋ค์ํ ๋ฐ์ดํฐ ์ง๊ณ๋ฅผ ํ์ฉํ๊ธฐ - Group Data
import pandas as pd
import seaborn as sns
df.groupby(by="col")
Return a GroupBy object,
grouped by values in column
named "col".
df.groupby(level="ind")
Return a GroupBy object,
grouped by values in index
level named "ind".
size()
Size of each group.
agg(function)
Aggregate group using function.
df = sns.load_dataset("mpg")
df.head()
mpg
cylinders
displacement
horsepower
weight
acceleration
model_year
origin
name
0
18.0
8
307.0
130.0
3504
12.0
70
usa
chevrolet chevelle malibu
1
15.0
8
350.0
165.0
3693
11.5
70
usa
buick skylark 320
2
18.0
8
318.0
150.0
3436
11.0
70
usa
plymouth satellite
3
16.0
8
304.0
150.0
3433
12.0
70
usa
amc rebel sst
4
17.0
8
302.0
140.0
3449
10.5
70
usa
ford torino
df.groupby(by="origin")
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000017AD720BE88>
df.groupby(by="origin").size()
origin
europe 70
japan 79
usa 249
dtype: int64
df['origin'].value_counts()
usa 249
japan 79
europe 70
Name: origin, dtype: int64
df.groupby(by="origin").max()
mpg
cylinders
displacement
horsepower
weight
acceleration
model_year
name
origin
europe
44.3
6
183.0
133.0
3820
24.8
82
vw rabbit custom
japan
46.6
6
168.0
132.0
2930
21.0
82
toyouta corona mark ii (sw)
usa
39.0
8
455.0
230.0
5140
22.2
82
pontiac ventura sj
df.groupby(by="origin").min()
mpg
cylinders
displacement
horsepower
weight
acceleration
model_year
name
origin
europe
16.2
4
68.0
46.0
1825
12.2
70
audi 100 ls
japan
18.0
3
70.0
52.0
1613
11.4
70
datsun 1200
usa
9.0
4
85.0
52.0
1800
8.0
70
amc ambassador brougham
df.groupby(by="origin")['weight'].mean()
origin
europe 2423.300000
japan 2221.227848
usa 3361.931727
Name: weight, dtype: float64
df.groupby(by="origin")['weight'].median()
origin
europe 2240
japan 2155
usa 3365
Name: weight, dtype: int64
df.groupby?
df.groupby(['origin', 'model_year'])['cylinders'].mean()
origin model_year
europe 70 4.000000
71 4.000000
72 4.000000
73 4.000000
74 4.000000
75 4.000000
76 4.250000
77 4.000000
78 4.833333
79 4.250000
80 4.111111
81 4.500000
82 4.000000
japan 70 4.000000
71 4.000000
72 3.800000
73 4.250000
74 4.000000
75 4.000000
76 4.500000
77 4.166667
78 4.000000
79 4.000000
80 4.076923
81 4.333333
82 4.000000
usa 70 7.636364
71 6.200000
72 6.888889
73 7.241379
74 6.266667
75 6.400000
76 6.363636
77 6.222222
78 6.000000
79 6.260870
80 4.285714
81 4.923077
82 4.300000
Name: cylinders, dtype: float64
pd.DataFrame(df.groupby(['origin', 'model_year'])['cylinders'].mean())
cylinders
origin
model_year
europe
70
4.000000
71
4.000000
72
4.000000
73
4.000000
74
4.000000
75
4.000000
76
4.250000
77
4.000000
78
4.833333
79
4.250000
80
4.111111
81
4.500000
82
4.000000
japan
70
4.000000
71
4.000000
72
3.800000
73
4.250000
74
4.000000
75
4.000000
76
4.500000
77
4.166667
78
4.000000
79
4.000000
80
4.076923
81
4.333333
82
4.000000
usa
70
7.636364
71
6.200000
72
6.888889
73
7.241379
74
6.266667
75
6.400000
76
6.363636
77
6.222222
78
6.000000
79
6.260870
80
4.285714
81
4.923077
82
4.300000
df.pivot_table?
Signature:
df.pivot_table(
values=None,
index=None,
columns=None,
aggfunc='mean',
fill_value=None,
margins=False,
dropna=True,
margins_name='All',
observed=False,
) -> 'DataFrame'
df2 = pd.DataFrame(
[[4, 7, 10],
[5, 8, 11],
[6, 9, 12]],
index=[1, 2, 3],
columns=['a', 'b', 'c'])
df2
a
b
c
1
4
7
10
2
5
8
11
3
6
9
12
df2.shift(1)
a
b
c
1
NaN
NaN
NaN
2
4.0
7.0
10.0
3
5.0
8.0
11.0
df2.shift(-1)
a
b
c
1
5.0
8.0
11.0
2
6.0
9.0
12.0
3
NaN
NaN
NaN
df2['a'].shift(2)
1 NaN
2 NaN
3 4.0
Name: a, dtype: float64
df2['b'].shift(-1)
1 8.0
2 9.0
3 NaN
Name: b, dtype: float64
df2['b'] = df2['b'].shift(-1)
df2
a
b
c
1
4
9.0
10
2
5
NaN
11
3
6
NaN
12
df['model_year']
0 70
1 70
2 70
3 70
4 70
..
393 82
394 82
395 82
396 82
397 82
Name: model_year, Length: 398, dtype: int64
df['model_year'].rank(method='max')
# ํด๋น ๊ฐ์ด ํฐ ์์ผ๋ก ๋ช๋ฒ์งธ ์์์ธ์ง
0 29.0
1 29.0
2 29.0
3 29.0
4 29.0
...
393 398.0
394 398.0
395 398.0
396 398.0
397 398.0
Name: model_year, Length: 398, dtype: float64
df['model_year'].rank(method='min')
# ํด๋น ๊ฐ์ด ์์ ์์ผ๋ก ๋ช๋ฒ์งธ ์์์ธ์ง
0 1.0
1 1.0
2 1.0
3 1.0
4 1.0
...
393 368.0
394 368.0
395 368.0
396 368.0
397 368.0
Name: model_year, Length: 398, dtype: float64
df['model_year'].rank(method='min').value_counts()
86.0 40
245.0 36
183.0 34
368.0 31
153.0 30
339.0 29
281.0 29
310.0 29
1.0 29
217.0 28
58.0 28
30.0 28
126.0 27
Name: model_year, dtype: int64
df['model_year'].rank(pct=True).head()
# pct๋ percentage๋ฅผ ์๋ฏธ
0 0.037688
1 0.037688
2 0.037688
3 0.037688
4 0.037688
Name: model_year, dtype: float64
df['model_year'].rank(method='first').head()
# ๋จผ์ ๋ฑ์ฅํ๋ ์์
0 1.0
1 2.0
2 3.0
3 4.0
4 5.0
Name: model_year, dtype: float64
df.rank?
df2
a
b
c
1
4
9.0
10
2
5
NaN
11
3
6
NaN
12
df2.cumsum()
# ๋์ ๊ฐ ๊ตฌํ๊ธฐ
a
b
c
1
4
9.0
10
2
9
NaN
21
3
15
NaN
33
df2['b'] = [9.0, 13.0, 11.5]
df2.cummax()
# df[b,3]์ ๊ฐ์ด 13๋ณด๋ค ์์ 11.5์ด๋ฏ๋ก ๊ทธ๋๋ก ์ ์ง๋๋ ๋ชจ์ต
a
b
c
1
4
9.0
10
2
5
13.0
11
3
6
13.0
12
df2.cummin()
a
b
c
1
4
9.0
10
2
4
9.0
10
3
4
9.0
10
df2.cumprod()
# ๋์ ๊ณฑ
a
b
c
1
4
9.0
10
2
20
117.0
110
3
120
1345.5
1320
마크다운 수식 입력
์ด๋ฒ์ฃผ์ฐจ ๊ฐ์๊ฐ ์์์ ์ ๋ ฅํ ์ผ์ด ๋ง์ ์๋ ์ฌ์ดํธ๋ฅผ ์๊ฐํ๋ค. (๋ฌผ๋ก ๋์๊ฒ)
https://math.meta.stackexchange.com/questions/5020/mathjax-basic-tutorial-and-quick-reference
[ํ๋ก๊ทธ๋๋จธ์ค AI ์ค์ฟจ 1๊ธฐ] 5์ฃผ์ฐจ DAY 1
Machine Learning ๊ธฐ์ด - ์๊ฐ
๋จธ์ ๋ฌ๋์ด๋?
๊ธฐ๊ณํ์ต. ๊ฒฝํ์ ํตํด ์๋์ผ๋ก ๊ฐ์ ํ๋ ์ปดํจํฐ ์๊ณ ๋ฆฌ์ฆ์ ์ฐ๊ตฌ.
ํ์ต๋ฐ์ดํฐ : ์ ๋ ฅ๋ฒกํฐ๋ค๊ณผ ๋ชฉํ๊ฐ๋ค
๋จธ์ ๋ฌ๋ ์๊ณ ๋ฆฌ์ฆ์ ๊ฒฐ๊ณผ๋ ๋ชฉํ๊ฐ์ ์์ธกํ๋ ํจ์
์ซ์ ์ธ์์์ ์ ๋ ฅ๋ฒกํฐ๋ ์๊ธ์จ ์ด๋ฏธ์ง, ๋ชฉํ๊ฐ์ 0๋ถํฐ 9๊น์ง ์ซ์์ค ์์ธก๊ฐ. ์ ํํ๋ 10๊ฐ์ ํด๋์ค ์ค ํ ํด๋์ค(์ด ๋ ํด๋์ค ๋๋ฒ๋ 1๋ถํฐ ์์ํจ)
핵심개념들
ํ์ต๋จ๊ณ: ํจ์ y(x)๋ฅผ ํ์ต๋ฐ์ดํฐ์ ๊ธฐ๋ฐํด ๊ฒฐ์ ํ๋ ๋จ๊ณ
์ํ์ : ๋ชจ๋ธ์ ํ๊ฐํ๊ธฐ ์ํด ์ฌ์ฉํ๋ ์๋ก์ด ๋ฐ์ดํฐ
일반화 : 모델에서 학습에 사용된 데이터가 아닌, 이전에 접하지 못한 새로운 데이터에 대해 올바른 예측을 수행하는 역량
์ง๋ํ์ต : ๋ถ๋ฅ์ ํ๊ท
๋น์ง๋ํ์ต : ๊ตฐ์ง
๋คํญ์ ๊ณก์ ๊ทผ์ฌ

Polynomial Curve Fitting
ํ๊ท ๋ฌธ์ ์ ํด๋นํ๋ค.
์ ๋ค์ ์ง๋๋(์ง๋์ง ๋ชปํ๋๋ผ๋ ์ต๋ํ ๊ฐ๊น๊ฒ) ํจ์ ๊ตฌํ๊ธฐ
ํ์ต๋ฐ์ดํฐ : ์ ๋ ฅ๋ฒกํฐ์ ๋ชฉํ๊ฐ
๋ชฉํ : ์๋ก์ด ์ ๋ ฅ๋ฒกํฐ๊ฐ ์ฃผ์ด์ง ๋ ๋ชฉํ๊ฐ์ ์์ธกํ๋ ๊ฒ
ํ๋ฅ ์ด๋ก : ์์ธก๊ฐ์ ๋ถํ์ค์ฑ์ ์ ๋ํ์์ผ ํํํ ์ ์๋ ์ํ์ ํ๋ ์์ํฌ ์ ๊ณต
๊ฒฐ์ ์ด๋ก : ํ๋ฅ ์ ํํ์ ๋ฐํ์ผ๋ก ์ต์ ์ ์์ธก์ ์ํํ ์ ์๋ ๋ฐฉ๋ฒ๋ก ์ ๊ณต

๊ณผ์์ ํฉ๊ณผ ๊ณผ๋์ ํฉ
์ค์ ํ์ต ๋ฐ์ดํฐ์ ํฌ๊ธฐ์ ๋นํด ๋๋ฌด ๊ณ ์ฐจ์ ํจ์ ๋๋ ๋๋ฌด ์ ์ฐจ์ ํจ์๋ฅผ ์ฌ์ฉํ๋ฉด ์ค์ ์ฑ๋ฅ์์ ์๋ฌ๊ฐ ๋ง์ด ๋ฐ์ํ๋ค.
평가 척도 : RMS(Root Mean Square) 오차
๋ํ, ๊ณ ์ฐจ์ ํจ์๋๋ผ๋ ๋ง์ ์์ ๋ฐ์ดํฐ๊ฐ ์กด์ฌํ๋ค๋ฉด ๊ณผ๋์ ํฉ์ด ๋ฐ์ํ ๊ฐ๋ฅ์ฑ์ด ์ ์ด์ง๋ค. ๊ทธ๋ฆฌ๊ณ ์ค์ ๋ก ๋จธ์ ๋ฌ๋์ ๋ฐ์ดํฐ ์๋ ๊ต์ฅํ ๋ง๋ค๋ ์ .
๊ท์ ํ(Regularization)
ํ๋ผ๋ฏธํฐ๊ฐ์ด ๋๋ฌด ์ปค์ง์ง ์๋๋ก ํ๋ ๋ฐฉ๋ฒ.

์ด ๋ ๋๋ฌด ์ฌํ๊ฒ ํ๋ฉด ๊ณผ๋/๊ณผ์ ์ ํฉ์ด ์๊ธฐ์น ์๊ฒ ๋ฐ์ํ ์ ์์.
Machine Learning ๊ธฐ์ด - ํ๋ฅ ์ด๋ก 1
ํ๋ฅ ๋ณ์
ํ๋ฅ ๋ณ์ X๋ ํ๋ณธ์ ์งํฉ S์ ์์ e๋ฅผ ์ค์๊ฐ X(e) = x์ ๋์์ํค๋ ํจ์์ด๋ค.
๋๋ฌธ์ X, Y, ... : ํ๋ฅ ๋ณ์
์๋ฌธ์ x, y, ... : ํ๋ฅ ๋ณ์๊ฐ ๊ฐ์ง ์ ์๋ ๊ฐ
확률 P는 집합 S의 부분집합을 실수값에 대응시키는 함수이다.
ex) S = {HH, HT, TH, TT}; throwing coin
X(HH) = 2, X(HT) = 1, X(TH) = 1, X(TT) = 0; head of coin appear
P[X = 1] = P[{HT, TH}] = 2/4 = 1/2
์ฐ์ ํ๋ฅ ๋ณ์(Continuous Random Variables)
누적분포함수 $F(x) = P[X \in (-\infty, x]]$ 일 때, $F(x)$를 가진 확률변수 X에 대해서 다음을 만족하는 함수 $f(x)$가 존재한다면 X를 연속확률변수라고 부르고 $f(x)$를 X의 확률밀도함수(probability density function)라고 부른다.
ํ๋ฅ ๋ณ์๋ฅผ ๋ช ํํ ํ๊ธฐ ์ํด F(x), f(x)๋ก ์ฐ๊ธฐ๋ก ํ๋ฉฐ ๋ฐ๋ ํจ์์ ๊ฒฝ์ฐ์๋ p(x)๋ฅผ ์ฌ์ฉํ๊ธฐ๋ ํ๋ค.
ํ๋ฅ ๋ณ์์ ์ฑ์ง
๋ง์ ๋ฒ์น
๊ณฑ์ ๋ฒ์น

๋ฒ ์ด์ฆ ํ๋ฅ (posterior ์ฌํํ๋ฅ , likelihood ๊ฐ๋ฅ์ฑ, prior ์ฌ์ ํ๋ฅ , marginal normalization ๊ฒฝ๊ณํ๋ฅ )

ํ๋ฅ ๋ณ์์ ํจ์
ํ๋ฅ ๋ณ์ X์ ํจ์ Y = f(X)๋ ํ๋ฅ ๋ณ์์ด๋ค. ์๋ฅผ ๋ค์ด ํ๋ฅ ๋ณ์ X๊ฐ ์ฃผ(week)์ ์๋ก ํํ๋์๋ค๊ณ ํ๋ฉด ์ผ(day)์ ์๋ก ํํ๋ ์๋ก์ด ํ๋ฅ ๋ณ์๋ฅผ ์ ์ํ ์ ์๋ค.
Y = 7X
P[14 <= Y <= 21] = P[2 <= X <= 3]
k차원의 확률변수 벡터 $\mathbf{x} = (x_1, \ldots, x_k)$가 주어질 때, k개의 $\mathbf{x}$에 관한 함수들은 새로운 확률변수 벡터 $\mathbf{y} = (y_1, \ldots, y_k)$를 정의한다. 간략하게 $\mathbf{y} = g(\mathbf{x})$로 나타낼 수 있다. 만약 $\mathbf{y} = g(\mathbf{x})$가 일대일 변환인 경우($\mathbf{x} = w(\mathbf{y})$로 유일한 해를 가질 때), $\mathbf{y}$의 결합확률밀도함수는 다음과 같다.
$p_\mathbf{y}(\mathbf{y}) = p_\mathbf{x}(w(\mathbf{y}))\,|J|$, where $J = \det\!\left(\frac{\partial w(\mathbf{y})}{\partial \mathbf{y}}\right)$ (야코비안 행렬식)
์์
์ผ ๋, ์ ์ํด์ ์ ์๋๋ y์ pdf๋?
Inverse CDF Technique๋ฅผ ์ฌ์ฉํ๋ฉด ๋ฐ๊ฒฝ์ด r์ธ ์ ์์ ๋๋คํ๊ฒ ์ ์ ์ฐ์ ์ ์๋ค.
๊ธฐ๋๊ฐ
ํ๋ฅ ๋ถํฌ p(x)ํ์์ ํจ์ f(x)์ ํ๊ท ๊ฐ
๋ถ์ฐ๊ณผ ๊ณต๋ถ์ฐ
f(x)์ ๋ถ์ฐ : f(x)์ ๊ฐ๋ค์ด ๊ธฐ๋๊ฐ์ผ๋ก๋ถํฐ ํฉ์ด์ ธ ์๋ ์ ๋
ํ๋ฅ ์ ํด์ํ๋ ๋ ๊ฐ์ง ๋ค๋ฅธ๊ด์ : ๋น๋์ฃผ์ ๋ ๋ฒ ์ด์ง์
๋น๋์ฃผ์ : ๋ฐ๋ณต๊ฐ๋ฅํ ์ฌ๊ฑด๋ค์ ๋น๋์์ ๊ธฐ๋ฐ
๋ฒ ์ด์ง์ : ๋ถํ์ค์ฑ์ ์ ๋์ ์ผ๋ก ํํ
๋ถ๊ทน ์ผ์์ด ์ด๋ฒ ์ธ๊ธฐ๋ง๊น์ง ๋ น์ ์์ด์ง ํ๋ฅ ์ ๋ฐ๋ณต๊ฐ๋ฅํ์ง ์์ ์ฌ๊ฑด์
๋น๋์ฃผ์๋ ์ถ์ ์ฌ๋ฅผ ์ฌ์ฉํด์ ํ๋ฅ ์ ๊ตฌํ๋ฉฐ ๊ตฌํด์ง ํ๋ผ๋ฏธํฐ์ ๋ถํ์ค์ฑ์ ๋ถํธ์คํธ๋ฉ ๋ฐฉ๋ฒ์ ์ด์ฉํด ๊ตฌํ๋ค.
๋ฒ ์ด์ง์์ ์ฌ์ ํ๋ฅ ์ ๋ชจ๋ธ์ ํฌํจ์ํฌ ์ ์๋ ์ฅ์ ์ด ์๋ค.
์ ๊ท๋ถํฌ
๋จ์ผ๋ณ์ x๋ฅผ ์ํ ๊ฐ์ฐ์์ ๋ถํฌ
Last updated
Was this helpful?