# Mount Google Drive at /content/gdrive/ inside the Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive/')

# Earlier attempt, kept for reference: install into a Drive-backed target directory.
# !pip install --target='/content/gdrive/My Drive/env' mxnet-cu101mkl==1.6.0  # updating mxnet to at least v1.6
# It presumably failed with: ConnectionResetError: [Errno 104] Connection reset by peer
!pip install -U mxnet-cu101mkl==1.6.0  # updating mxnet to at least v1.6

1. Getting Started with MXNet¶

가장 먼저 데이터를 변수에 저장해 봅시다. 데이터는 1차원일 때는 vector, 2차원일 때는 matrix라고 부르며, 3차원 이상일 때는 특별히 지칭하는 이름이 없어 통틀어 tensor라고 주로 부릅니다. 예를 들어 숫자 12개를 원소로 갖는 vector는 아래와 같이 표현할 수 있으며, 추가적인 설정을 하지 않으면 이 tensor는 main memory에 저장되어 CPU 기반 연산에 사용됩니다.

MXNet의 np와 numpy 가 유사하지만 혹시 차이점이 있을까 하여 numpy도 함께 실습해보겠습니다

from mxnet import np,npx #numpy와 numpy extension
import numpy as onp # original numpy

# Store a 12-element vector in `x` (MXNet np) and `ox` (original NumPy), then
# show each array's contents, shape and Python type side by side.
print("np on MXNet")
x = np.arange(12)
print(x, x.shape, type(x), sep="\n")

print("\noriginal numpy")
ox = onp.arange(12)
print(ox, ox.shape, type(ox), sep="\n")

np on MXNet
[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11.]
(12,)
<class 'mxnet.numpy.ndarray'>

original numpy
[ 0  1  2  3  4  5  6  7  8  9 10 11]
(12,)
<class 'numpy.ndarray'>

# Reshape the 12-element vectors into a 3x4 matrix and a 2x3x2 tensor.
print("np on MXNet")
x1 = x.reshape(3, 4)
# -1 asks reshape to infer that axis; here it is the same as x.reshape(2, 3, 2)
x2 = x.reshape(2, -1, 2)
print(x1, x1.shape, x2, x2.shape, sep="\n")

print("\noriginal numpy")
ox1 = ox.reshape(3, 4)
ox2 = ox.reshape(2, 3, 2)
print(ox1, ox1.shape, ox2, ox2.shape, sep="\n")

np on MXNet
[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]]
(3, 4)
[[[ 0.  1.]
  [ 2.  3.]
  [ 4.  5.]]

 [[ 6.  7.]
  [ 8.  9.]
  [10. 11.]]]
(2, 3, 2)

original numpy
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
(3, 4)
[[[ 0  1]
  [ 2  3]
  [ 4  5]]

 [[ 6  7]
  [ 8  9]
  [10 11]]]
(2, 3, 2)

# Several ways to create a tensor: all zeros, all ones, samples drawn from a
# normal distribution (mean 0, std 1), and an explicit nested list.
print("np on MXNet")
print(
    np.zeros((2, 3, 1)),
    np.ones((2, 3, 1)),
    np.random.normal(0, 1, size=(3, 4)),
    np.array([[1, 54], [2, 56]]),
    sep="\n",
)

print("\noriginal numpy")
print(
    onp.zeros((2, 3, 1)),
    onp.ones((2, 3, 1)),
    onp.random.normal(0, 1, size=(3, 4)),
    onp.array([[1, 54], [2, 56]]),
    sep="\n",
)

np on MXNet
[[[0.]
  [0.]
  [0.]]

 [[0.]
  [0.]
  [0.]]]
[[[1.]
  [1.]
  [1.]]

 [[1.]
  [1.]
  [1.]]]
[[-1.771029   -0.5459446  -0.45138445 -2.3556297 ]
 [ 0.57938355  0.5414402  -1.856082    2.6785066 ]
 [-1.9768796   1.2546344  -0.20801921 -0.54877394]]
[[ 1. 54.]
 [ 2. 56.]]

original numpy
[[[0.]
  [0.]
  [0.]]

 [[0.]
  [0.]
  [0.]]]
[[[1.]
  [1.]
  [1.]]

 [[1.]
  [1.]
  [1.]]]
[[-0.25167444  0.87177608  1.54761448  0.59364384]
 [ 1.48541644  0.93959338 -0.33410851 -1.83669898]
 [-0.58915769 -0.27788139  1.14203692 -1.86398843]]
[[ 1 54]
 [ 2 56]]

요소별 산술 연산. 표준 산술 연산자(+, -, *, /, **)는 두 텐서의 shape이 같을 때 모두 요소별(element-wise)로 연산한다.

# Element-wise arithmetic on two MXNet arrays of the same shape.
x=np.array([1,2,3,4])
y=np.array([6,4,2,1])
# The trailing tuple is the cell's displayed value: sum, difference, product,
# quotient and power, each computed element by element.
x + y, x - y, x * y, x / y, x ** y

(array([7., 6., 5., 5.]),
 array([-5., -2.,  1.,  3.]),
 array([6., 8., 6., 4.]),
 array([0.16666667, 0.5       , 1.5       , 4.        ]),
 array([ 1., 16.,  9.,  4.]))

# Unary functions such as exp are applied element by element as well.
np.exp(x)

array([ 2.7182817,  7.389056 , 20.085537 , 54.59815  ])

# Concatenate two 3x4 MXNet arrays along each axis, compare them element by
# element, and reduce one of them to a scalar with sum().
x = np.arange(12).reshape(3, 4)
y = np.random.normal(0, 12, size=(3, 4))
print(
    np.concatenate([x, y], axis=0).shape,  # rows stacked
    np.concatenate([x, y], axis=1).shape,  # columns stacked
    sep="\n",
)
print(x, y, x == y)
print(y.sum())

(6, 4)
(3, 8)
[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]] [[-11.654632    6.043085   -6.9907475 -14.27334  ]
 [  4.4604926  -6.6025643  11.160087  -19.102507 ]
 [-17.070906  -13.298335   -6.211439    0.9446422]] [[False False False False]
 [False False False False]
 [False False False False]]
-72.59616

# The same element-wise arithmetic with original NumPy, for comparison.
x=onp.array([1,2,3,4])
y=onp.array([6,4,2,1])
# Displayed value of the cell: sum, difference, product, quotient and power.
x + y, x - y, x * y, x / y, x ** y

(array([7, 6, 5, 5]),
 array([-5, -2,  1,  3]),
 array([6, 8, 6, 4]),
 array([0.16666667, 0.5       , 1.5       , 4.        ]),
 array([ 1, 16,  9,  4]))

# Element-wise exponential with original NumPy.
onp.exp(x)

array([ 2.71828183,  7.3890561 , 20.08553692, 54.59815003])

# Original-NumPy counterpart: concatenate two 3x4 arrays along each axis,
# compare them element by element, and sum one of them.
x = onp.reshape(onp.arange(12), (3, 4))
y = onp.random.normal(loc=0, scale=12, size=(3, 4))
print(
    onp.concatenate([x, y], axis=0).shape,  # rows stacked
    onp.concatenate([x, y], axis=1).shape,  # columns stacked
    sep="\n",
)
print(x, y, x == y)
print(onp.sum(y))

(6, 4)
(3, 8)
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]] [[  0.6412043   -8.2299687    4.4613359    5.92456812]
 [ -9.78738279   1.28626319   8.33468643   5.1567833 ]
 [  2.85440103 -20.10400148  -3.03958966   2.55076974]] [[False False False False]
 [False False False False]
 [False False False False]]
-9.950930617511077

tensor의 형식(shape)이 다를 때는 Broadcasting Mechanism이 적용된다

# Two MXNet arrays with different shapes: a is 3x1, b is 1x2.
a=np.arange(3).reshape(3,1)
b=np.arange(2).reshape(1,2)
a,b

(array([[0.],
        [1.],
        [2.]]), array([[0., 1.]]))

# Comparing or adding the 3x1 and 1x2 arrays broadcasts both to 3x2
# (see the recorded output of this cell).
a,b,a==b,a+b

(array([[0.],
        [1.],
        [2.]]), array([[0., 1.]]), array([[ True, False],
        [False,  True],
        [False, False]]), array([[0., 1.],
        [1., 2.],
        [2., 3.]]))

# The same broadcasting experiment with original NumPy: 3x1 against 1x2.
a=onp.arange(3).reshape(3,1)
b=onp.arange(2).reshape(1,2)
a,b, a==b, a+b

(array([[0],
        [1],
        [2]]), array([[0, 1]]), array([[ True, False],
        [False,  True],
        [False, False]]), array([[0, 1],
        [1, 2],
        [2, 3]]))

배열의 인덱스를 이용하여 더욱 편리하게 텐서를 다룰 수 있다

# Indexing and slicing: the last element of the vector, a full slice of the
# 3x4 view, and the sub-block made of row 1 and its first two columns.
x=np.arange(12)
x[-1],x.reshape(3,4)[:,:],x.reshape(3,4)[1:2,:2]

(array([11.]), array([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.]]), array([[4., 5.]]))

연산을 좀 더 메모리 효율적으로 진행하여보자. 연산을 수행할 때마다 결과는 새로운 메모리에 할당된다. 이는 python이 y+x 연산을 진행한 후, 새로운 메모리에 이를 저장하고, y가 이 메모리를 지정(point)하도록 연산이 진행되기 때문이다.

# Show that binding a name to a new result changes which object it refers to.
x=np.arange(5)
y=np.arange(5,10)
print(id(y))
y=np.arange(5,10)
print(id(y)) # the id changes even when y is merely re-assigned an equal array
before=id(y)
y=y+x  # y + x is evaluated into a new array and y is rebound to it
id(y)==before

140081308139880
140081308142544

False

# Original NumPy behaves the same way: every assignment below binds y to a
# different object, so the final identity check is False in the recorded run.
x=onp.arange(5)
y=onp.arange(5,10)
print(id(y))
y=onp.arange(5,10)
print(id(y))
before=id(y)
y=y+x
id(y)==before

140081308500176
140081307873888

False

이러한 방식의 연산은 효율적이지 않다. 특히 머신러닝에서는 수많은 파라미터연산이 진행되므로 이렇게 필수적이지 않은 메모리 낭비를 막는것이 좋다.

# y1 is a second name for the very same array object as y.
y=np.arange(5,10)
y1=y
print(id(y), id(y1))
print(y==y1)
print(id(y)==id(y1))
# Rebinding y creates a new object; y1 still refers to the old one, so the
# contents compare equal while the identities differ.
y=np.arange(5,10)
print(id(y),id(y1))
print(y==y1)
print(id(y)==id(y1))

140081308602160 140081308602160
[ True  True  True  True  True]
True
140083456924408 140081308602160
[ True  True  True  True  True]
False

또한 위처럼 업데이트를 통한 방식은 위와같은 혼선을 낳을 수 있다. 해결방법은 아래와 같다. 사전에 할당된 배열의 slice 방식을 이용하는 것이다.

# 1. Write the result into a pre-allocated array through slice assignment
z=np.zeros_like(y)
# NOTE(review): the next two prints report id(x), but the array being written
# in place here is z. Printing id(z) before and after would be what shows that
# the name z still refers to the same object after `z[:] = ...` — confirm intent.
print('id(x): ', id(x))
z[:]=x+y
print('id(x): ', id(x))

# 2. If the previous contents of x do not need to be preserved, update x in place
print('id(x): ', id(x))
x += y
# equivalent to 'x[:]=x+y'
print('id(x): ', id(x))

id(x):  140081307849496
id(x):  140081307849496
id(x):  140081307849496
id(x):  140081307849496

MXNet 데이터와 다른 Object의 상호 변환은 아래와 같이 진행할 수 있다.

# Round-trip one small array between MXNet, NumPy and PyTorch, printing the
# Python type after every conversion.
_mxnet=np.arange(3)
print('np.arange(3)', type(_mxnet))
_numpy=_mxnet.asnumpy()  # MXNet -> NumPy
print('_mxnet.asnumpy()', type(_numpy))
_mxnet=np.array(_numpy)  # NumPy -> MXNet
print('np.array(_numpy)', type(_mxnet))
import torch
_torch=torch.tensor(_numpy)  # NumPy -> PyTorch
print('torch.tensor(_numpy)', type(_torch))
_numpy=_torch.numpy()  # PyTorch -> NumPy
print('_torch.numpy()', type(_numpy))
# The shape is (3,) in every representation.
print(_mxnet.shape)
print(_numpy.shape)
print(_torch.shape)

np.arange(3) <class 'mxnet.numpy.ndarray'>
_mxnet.asnumpy() <class 'numpy.ndarray'>
np.array(_numpy) <class 'mxnet.numpy.ndarray'>
torch.tensor(_numpy) <class 'torch.Tensor'>
_torch.numpy() <class 'numpy.ndarray'>
(3,)
(3,)
torch.Size([3])

2. Data Preprocessing¶

pandas를 이용하여 데이터를 다뤄보자.

import os
def make_tmp_csv(path):
  """Write a small sample housing CSV named ``empty.csv`` into ``path``.

  The file holds a header row (NumRooms, Alley, Price) followed by four data
  rows in which missing values are spelled ``NA``.

  Args:
    path: Directory to write into. It is created if it does not exist yet;
      an empty string means the current working directory.

  Returns:
    The path of the written file, i.e. ``os.path.join(path, 'empty.csv')``.
  """
  # open() does not create missing parent directories, so create the target
  # first. os.makedirs('') raises, hence the guard for the empty string.
  if path:
    os.makedirs(path, exist_ok=True)
  csv_path = os.path.join(path, 'empty.csv')
  rows = (
    'NumRooms,Alley,Price\n',  # Column names
    'NA,Pave,127500\n',  # Each row represents a data point
    '2,NA,106000\n',
    '4,NA,178100\n',
    'NA,NA,140000\n',
  )
  # Explicit encoding keeps the file independent of the platform default.
  with open(csv_path, 'w', encoding='utf-8') as f:
    f.writelines(rows)
  return csv_path

# Create the sample CSV in the current directory and list the directory contents.
filepath=make_tmp_csv('./')
!ls

empty.csv  gdrive  sample_data

import pandas as pd

# Load the sample CSV; in the recorded output the 'NA' entries appear as NaN.
data=pd.read_csv(filepath)
print(data)

   NumRooms Alley   Price
0       NaN  Pave  127500
1       2.0   NaN  106000
2       4.0   NaN  178100
3       NaN   NaN  140000

# NaN values can be handled in two ways: drop them or fill them in. This
# dataset is tiny, so filling is the better choice. Only the numeric NumRooms
# column is imputed (with its mean).
inputs, outputs = data.iloc[:, 0:2], data.iloc[:, 2]
# numeric_only=True restricts the mean to numeric columns. Without it, pandas
# >= 2.0 raises a TypeError on the string column Alley instead of skipping it.
inputs = inputs.fillna(inputs.mean(numeric_only=True))
print(inputs.mean(numeric_only=True))
print(inputs)
# NaN in the categorical column Alley is left untouched by the fill above.
# get_dummies turns it into indicator columns instead; with dummy_na=True this
# yields Alley_Pave and Alley_nan. dtype=int keeps the indicators as 0/1:
# pandas >= 2.0 would otherwise produce booleans, which makes `inputs.values`
# an object array when mixed with the float column.
inputs = pd.get_dummies(inputs, dummy_na=True, dtype=int)
print(inputs)

NumRooms    3.0
dtype: float64
   NumRooms Alley
0       3.0  Pave
1       2.0   NaN
2       4.0   NaN
3       3.0   NaN
   NumRooms  Alley_Pave  Alley_nan
0       3.0           1          0
1       2.0           0          1
2       4.0           0          1
3       3.0           0          1

from mxnet import np
# .values exposes the underlying NumPy arrays of the pandas objects.
print(inputs.values, type(inputs.values))
print(outputs.values, type(outputs.values))

# Wrap them as MXNet arrays; the recorded output shows mxnet.numpy.ndarray.
x,y=np.array(inputs.values), np.array(outputs.values)
print(x, type(x))
print(y, type(y))

[[3. 1. 0.]
 [2. 0. 1.]
 [4. 0. 1.]
 [3. 0. 1.]] <class 'numpy.ndarray'>
[127500 106000 178100 140000] <class 'numpy.ndarray'>
[[3. 1. 0.]
 [2. 0. 1.]
 [4. 0. 1.]
 [3. 0. 1.]] <class 'mxnet.numpy.ndarray'>
[127500 106000 178100 140000] <class 'mxnet.numpy.ndarray'>

이외에도, 그냥 nan값을 drop하여 데이터 전처리를 진행할 수도 있다.

import pandas as pd

# Reload the raw CSV so the dropna experiments start from unmodified data.
data=pd.read_csv(filepath)
print(data)

   NumRooms Alley   Price
0       NaN  Pave  127500
1       2.0   NaN  106000
2       4.0   NaN  178100
3       NaN   NaN  140000

# A plain dropna is a poor fit for this data: in the recorded output, dropping
# rows (axis=0) leaves an empty DataFrame and dropping columns (axis=1) leaves
# only Price.
print(data.dropna(axis=0), '\n*********')
print(data.dropna(axis=1))

Empty DataFrame
Columns: [NumRooms, Alley, Price]
Index: [] 
*********
    Price
0  127500
1  106000
2  178100
3  140000

# 1. Name explicitly which column must not contain NaN
print(data.dropna(subset=['NumRooms']))

# 2. Convert NaN in the categorical column into a dummy variable first, then
#    drop the rows that still contain NaN
data=pd.get_dummies(data, dummy_na=True)
print(data.dropna(axis=0))

   NumRooms Alley   Price
1       2.0   NaN  106000
2       4.0   NaN  178100
   NumRooms   Price  Alley_Pave  Alley_nan
1       2.0  106000           0          1
2       4.0  178100           0          1

3. Html로 저장¶

# Check that the notebook file exists on the mounted Drive, then convert it to HTML.
! ls '/content/gdrive/My Drive/Colab Notebooks/2.2데이터 조작.ipynb'
! jupyter nbconvert --to html '/content/gdrive/My Drive/Colab Notebooks/2.2데이터 조작.ipynb'

'/content/gdrive/My Drive/Colab Notebooks/2.2데이터 조작.ipynb'
[NbConvertApp] Converting notebook /content/gdrive/My Drive/Colab Notebooks/2.2데이터 조작.ipynb to html
[NbConvertApp] Writing 318063 bytes to /content/gdrive/My Drive/Colab Notebooks/2.2데이터 조작.html

데이터 전처리와 가공법

1. Getting Started with MXNet¶

2. Data Preprocessing¶

3. Html로 저장¶

Author: 황 유진

Leave a Reply Cancel reply

Conference Deadline

NEW POST

New Comment