파이썬/snippet

[파이썬/snippet] pandas & numpy snippet모음

gumin00 2022. 10. 2. 15:00

snippet.html

1.03MB

snippet.ipynb

0.26MB

데이터 불러오기, 데이터프레임 편집¶

라이브러리 import¶

In [ ]:

%matplotlib inline
# %matplotlib nbagg
    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = [10, 6]

# Suppress Deprecation and Incorrect Usage Warnings 
import warnings
warnings.filterwarnings('ignore')

Pandas 출력을 옆으로 출력¶

In [ ]:

# from IPython.display import display, HTML
# CSS = """
# .output {
#     flex-direction: row;
# }
# """
# HTML('<style>{}</style>'.format(CSS))

기본 글꼴 변경¶

In [ ]:

import matplotlib as mpl
import matplotlib.font_manager  # load the submodule explicitly rather than relying on pyplot having imported it

# Refresh Matplotlib's font cache so a newly installed font can be found.
# `_rebuild` is a private helper that no longer exists in recent Matplotlib
# releases, so it is only called when the installed version still has it.
if hasattr(mpl.font_manager, '_rebuild'):
    mpl.font_manager._rebuild()
# Make NanumBarunGothic the default font family for all figures.
mpl.rc('font', family='NanumBarunGothic')

# allow multiple outputs
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

# allow image input
from IPython.display import Image

csv 필드값이 매우 큰 경우 에러 해결¶

출처: https://yganalyst.github.io/data_handling/memo_7/
에러메시지: OverflowError: Python int too large to convert to C int

In [ ]:

import sys
import csv

# Raise the csv field-size limit as high as the platform accepts.
# Start from sys.maxsize and shrink by a factor of 10 each time the C layer
# rejects the value with OverflowError.
maxparse = sys.maxsize
accepted = False
while not accepted:
    try:
        csv.field_size_limit(maxparse)
    except OverflowError:
        maxparse = int(maxparse/10)
    else:
        accepted = True

학생정보 엑셀파일 불러오기¶

In [ ]:

# folder =  "Z:/200. 연락처/isus_db/학생정보.xlsx"
folder =  "Z:/공유 드라이브/IUDP_BASIC/200. 연락처/isus_db/학생정보.xlsx"
# Columns to read as datetimes. They belong in `parse_dates`; `date_parser`
# expects a callable, so passing the column list there parsed nothing.
# NOTE(review): the names must match the sheet header exactly -- later cells
# refer to '참가기간(시작)', so confirm which spelling the workbook uses.
date_cols = ['참가기간_시작','참가기간_종료','체류기간_시작','체류기간_종료','졸업일']
stu_info = pd.read_excel(folder, parse_dates=date_cols, sheet_name='IUDPStuInfo')
stu_info.shape

Out[ ]:

(567, 46)

이미지 불러오기¶

In [ ]:

# Render a local PNG file inline in the notebook output.
Image("img/read_csv함수의 인수.png")

from IPython.display import Image 패키지 실행
code안 출력용: Image("img/picture.png") 를 code안에 넣고 실행
markdown용: ![title](img/picture.png)를 markdown안에 넣고 실행

데이터프레임 편집¶

출처: https://github.com/justmarkham/pandas-videos/blob/master/top_25_pandas_tricks.ipynb#scrollTo=oYSvgYSE7s1_

데이터프레임 사이즈 축소¶

'usecols=' 로 필요한 열만 불러오기
'dtype='로 범주형 데이터 포함 object 데이터 타입열을 category 데이터 타입으로 변환

In [ ]:

# Read only the two needed columns and store 'continent' as a category,
# then report the real ("deep") memory footprint.
cols = ['beer_servings', 'continent']
dtypes = {'continent':'category'}
smaller_drinks = pd.read_csv(
    'http://bit.ly/drinksbycountry',
    usecols=cols,
    dtype=dtypes,
)
smaller_drinks.info(memory_usage='deep')

복수파일의 데이터셋에서 단일 데이터프레임 생성(행방향): glob 패키지 사용¶

stock_files = sorted(glob('data/stocks*.csv'))
pd.concat((pd.read_csv(file) for file in stock_files), ignore_index=True)

In [ ]:

# Assumes data/ holds stocks1.csv, stocks2.csv, stocks3.csv with the same layout.
from glob import glob
stock_files = sorted(glob('data/stocks*.csv'))
# Lazily read each file and stack the frames row-wise under a fresh 0..n-1 index.
frames = (pd.read_csv(path) for path in stock_files)
pd.concat(frames, ignore_index=True)

복수파일의 데이터셋에서 단일 데이터프레임 생성(열방향): glob 패키지 사용¶

drink_files = sorted(glob('data/drinks*.csv'))
pd.concat((pd.read_csv(file) for file in drink_files), axis='columns')

In [ ]:

# Assumes data/ holds drinks1.csv, drinks2.csv, drinks3.csv with the same rows
# but different columns.
from glob import glob
drink_files = sorted(glob('data/drinks*.csv'))
# Concatenate side by side. The generator must iterate `drink_files`;
# the original referenced the undefined name `drinks_files` (NameError).
pd.concat(
  (pd.read_csv(file) for file in drink_files), axis='columns'
)

클립보드로 데이터프레임 만들기¶

pd.read_clipboard()

In [ ]:

# Build a DataFrame from the tabular text currently on the system clipboard.
df = pd.read_clipboard()
df

In [ ]:

데이터프레임을 두 개의 랜덤 데이터셋으로 나누기¶

subset1 = data.sample(frac=0.75, random_state=1234)
subset2 = data.drop(subset1.index)

75%/25% 비율의 2개의 무작위 데이터 셋 만들기

In [ ]:

movies = pd.read_csv('http://bit.ly/imdbratings')
len(movies)
# Draw a random 75% of the rows; random_state fixes the seed so the split is repeatable.
movies_1 = movies.sample(frac=0.75, random_state=1234)
# To sample a fixed number of rows instead of a fraction, use 'n=<count>' in place of 'frac='.

In [ ]:

# The complementary subset: every row whose index label was not drawn into movies_1.
movies_2 = movies.drop(movies_1.index)

In [ ]:

데이터프레임 특정열 내 값유무에 따른 필터링¶

data[data.column.isin(['value1', 'value2', 'value3'])]

In [ ]:

movies = pd.read_csv('http://bit.ly/imdbratings')
# List the distinct values found in the genre column.
movies.genre.unique()

In [ ]:

# Keep rows whose genre is one of the listed values (first 5 shown); or, equivalently:
movies[movies.genre.isin(['Action', 'Drama', 'Western'])].head() # 또는

# movies[(movies.genre == 'Action') | 
#        (movies.genre == 'Drama') | 
#        (movies.genre == 'Western')].head() 

갯수 많은 데이터순으로 필터링¶

data[ data.column.isin( data.column.value_counts().nlargest(3).index ) ]

In [ ]:

# Labels of the three most frequent genres, then the rows that belong to them.
top3_genres = movies.genre.value_counts().nlargest(3).index
movies[movies.genre.isin(top3_genres)].head()

칼럼명의 문자열을 나눠 multiple 칼럼 생성¶

data.column.str.split(' ', expand=True)

In [ ]:

# Two-row sample frame: full names and "city, state" locations.
names = ['John Arthur Doe', 'Jane Ann Smith']
locations = ['Los Angeles, CA', 'Washington, DC']
df = pd.DataFrame({'name': names, 'location': locations})
df

In [ ]:

# Split each name on spaces; expand=True returns one column per piece,
# which are assigned to three new columns at once.
df[['first', 'middle', 'last']] = df.name.str.split(' ', expand=True)
df

In [ ]:

# Split on ', ' and keep only the first piece (column 0) as the city.
df['city'] = df.location.str.split(', ', expand=True)[0]
df

인코딩 확인¶

In [ ]:

import chardet

In [ ]:

def find_encoding(fname):
    """Guess the character encoding of a file.

    Args:
        fname: Path of the file to inspect.

    Returns:
        The value stored under the ``'encoding'`` key of chardet's
        detection result for the file's raw bytes.
    """
    # Read through a context manager so the handle is closed right away;
    # the original left the file open.
    with open(fname, 'rb') as r_file:
        raw = r_file.read()
    return chardet.detect(raw)['encoding']

과학적 표기법¶

In [ ]:

# Source: https://financedata.github.io/posts/pandas-display-format.html

# Fixed-point display (no scientific notation)
pd.set_option('display.float_format', '{:.2f}'.format)

# Scientific notation
pd.options.display.float_format = '{:.2e}'.format

pd.options.display.float_format = '{:.2f}'.format  # always fixed-point
pd.options.display.float_format = '{:.2e}'.format  # always scientific
pd.options.display.float_format = '${:.2g}'.format  # general format, picks a suitable form

# Assigning None restores the default display format
pd.options.display.float_format = None

# 과학적 표기법으로 나오는 숫자를 일반형식으로 나오게 하는 법  <.str()>

In [ ]:

# Print floats with 2 digits of precision. The keyword is `precision`;
# the original `precisions=` raised a TypeError.
np.set_printoptions(precision=2)
# Format every float with "%g" so values print in compact general notation.
np.set_printoptions(formatter={"float_kind": lambda x: "%g" % x})

# Round to 2 decimals; np.ceil rounds up (np.floor rounds down, np.trunc drops the fraction)
ms["Percent"].round(2); np.ceil(ms["Percent"]), # np.floor() / np.trunc()

# Change the number of displayed decimals (both forms produce strings)
ms["Percent"].apply(lambda x: "{0:0.2f}".format(x)),
ms["Percent"].apply(lambda x: "%0.2f" % x),

단어수세기¶

In [ ]:

# Number of whitespace-separated words in each title
petitions['title'].apply(lambda x: len(str(x).split()))
# Number of distinct words in each title (duplicates removed via set)
petitions['title'].apply(lambda x: len(set(str(x).split())))

항목의 없는 원소 추출 (연산자 not 사용)¶

df[-df["column"].isin(["value"])]
df[~df["column"].isin(["value"])]
df[df["column"].isin(["value"]) == False]
df[np.logical_not(df["column"].isin(["value"]))]

apply 변환¶

행이나 열 단위로 더 복잡한 처리를 하고 싶을 때는 사용
인수로 행 또는 열을 받는 함수를 apply 메서드의 인수로 넣으면 각 열(또는 행)을 반복하여 그 함수에 적용시킴

In [ ]:

df3 = pd.DataFrame({
    'A': [1, 3, 4, 3, 4],
    'B': [2, 3, 1, 2, 3],
    'C': [1, 5, 2, 4, 4]
})

# Range (max - min) of each column
df3.apply(lambda x: x.max() - x.min())

# Range (max - min) of each row
df3.apply(lambda x: x.max() - x.min(), axis=1)

# How often each value occurs in each column.
# Series.value_counts is used because the top-level pd.value_counts
# function is deprecated.
df3.apply(pd.Series.value_counts)

# Values absent from a column come back as NaN: fill them with 0 and
# cast the whole table back to int.
df3.apply(pd.Series.value_counts).fillna(0).astype(int)

주제별 정리¶

null값 (누락값) 처리¶

특정 열에 null 있는지 여부만 확인 <.hasnans> / 특정열의 각 원소 null 여부 boolean 반환 <np.isnan(Series)>¶

In [ ]:

# True if the column contains at least one missing value.
airplane_info['총정원'].hasnans

In [ ]:

# Element-wise boolean mask: True where the value is NaN.
np.isnan(airplane_info['총정원']) 
# = airplane_info['총정원'].apply(np.isnan)

In [ ]:

null값 제외
<Series[Series != 'nan']
Series[np.logical_not(Series == 'nan')]
Series[-(Series == 'nan')]
Series[~(Series == 'nan')]
Series[(Series == 'nan') == False] >

In [ ]:

# Keep the entries that are not equal to the string 'nan'.
# NOTE(review): this only removes literal 'nan' strings. A real NaN never
# compares equal to 'nan', so genuine missing values pass through;
# s.dropna() or s[s.notna()] may be what is intended -- confirm against the data.
s[s != 'nan']

In [ ]:

null값 있는 행 또는 열 삭제
<df.dropna(axis=0, how='any', thresh=None(임계값,int), subset=None(array-like), inplace=False)>
Series.dropna 의 경우, Series 전체 아닌 null 값만 삭제

In [ ]:

# Load the airplane data with the Python parsing engine, then show the
# last three values of the '총정원' column.
airplane_info = pd.read_csv('C:/Users/Gumin JUNG/OneDrive - 서울시립대학교/jupyter_notebook/실습자료-손에잡히는판다스/data/airplane_info.csv',
                            engine='python')
airplane_info['총정원'].tail(3)

In [ ]:

# Drop every column that contains at least one missing value (returns a new frame).
airplane_info.dropna(axis=1)

In [ ]:

null값 개수 확인
<Series[Series != 'nan'].value_counts()>

In [ ]:

# Frequency table of the entries that are not the string 'nan'.
# NOTE(review): real NaN values are not equal to 'nan' and are not removed by
# this mask; to count missing values use s.isnull().sum() -- confirm intent.
s[s != 'nan'].value_counts()

In [ ]:

null값 위치를 고정하고 sorting
<.sort_values(by='a', na_position='last')>

In [ ]:

# Sort by column 'a' and place missing values at the end ('first' puts them at the top).
df.sort_values(by='a', na_position='last') # na_position='first'

In [ ]:

null 값 포함해 연산(합계)
<df.sum(skipna=False)>

In [ ]:

# Column sums without skipping NaN: any column containing NaN sums to NaN.
df.sum(skipna=False)

In [ ]:

null값 다른 값으로 변경
<.fillna('대체할값')>

s.fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None)

value: null값을 대체할 값. dict 형태로 행 또는 열마다의 대체값 별도 지정 가능
method:
- null값 발생 전 값으로 변경: 'ffill' (or 'pad')
- null값 발생 후 값으로 변경: 'bfill' (or 'backfill')
axis: 데이터프레임에서 null 값을 채우는 기준 방향으로 0은 행방향, 1은 열방향
inplace: True 는 null 값을 대체한 결과값으로 데이터프레임 변경
limit: method를 연속적으로 수행하는 최대수
downcast: 데이터타입 변경

In [ ]:

import numpy as np
import pandas as pd
# Small float Series with one missing value at position 2.
s = pd.Series([0, 1, np.nan, 3])

In [ ]:

# Replace missing values with a constant.
s.fillna('No')
# Forward-fill, then backward-fill. The dedicated methods are used because
# fillna(method=...) is deprecated in recent pandas releases.
s.ffill()
s.bfill()
# Replace every missing cell of stu_info with a single space, in place.
stu_info.fillna(" ", axis=1, inplace=True)

In [ ]:

# Per-column fill values:
#   '국외여행합계' -> 0
#   '논문평점'     -> mean of the scores that are present
#   '자국내 주소'  -> the value of the '국가' column
# The original mask `!= np.nan` is True for every row (NaN never compares
# equal to anything), so notna() is used to select the present scores.
has_score = stu_info['논문평점'].notna()
stu_info_pre = \
stu_info.fillna(
    {'국외여행합계': 0, 
     '논문평점': stu_info.loc[has_score, '논문평점'].mean(),
     '자국내 주소': stu_info['국가']}
).replace({'자국내 주소':'정보없음'}, {'자국내 주소': stu_info['국가']})

# Write the cleaned table to an Excel sheet named "IUDPinfo".
stu_info_pre.to_excel("C:/Users/Gumin JUNG/Desktop/IUDP학생현황.xlsx", sheet_name="IUDPinfo")

In [ ]:

각 열 내 null값을 가진 셀의 합들의 합
<stu_info.isnull().sum().sum()>

stu_info.isnull().sum().sum()

In [ ]:

# Total number of missing cells: per-column null counts, summed again over columns.
stu_info.isnull().sum().sum()

In [ ]:

null값 다른 값으로 변경 (.pivot_table에서)
<.pivot_table(values='Series', index='Series', columns='Series', fill_value = '대체할값')>

In [ ]:

# Pivot the petitions with more than 200000 votes; empty cells in the
# resulting table are shown as 0 via fill_value.
petitions[petitions['votes'] > 200000].pivot_table(values='answered', 
  index='category', columns='duration', fill_value= 0)

In [ ]:

null값 보간처리 <.interpolate()>¶

처리방법: 전,후값의 평균값 / 후값이 없을 경우, 전값 그대로 반환

In [ ]:

# Series with a gap at position 2; interpolate() fills it from its neighbours.
values = [0, 1, np.nan, 3]
s = pd.Series(values)
s.interpolate()

In [ ]:

# Reload the airplane data and show the last three values of '총정원'.
airplane_info = pd.read_csv('C:/Users/Gumin JUNG/OneDrive - 서울시립대학교/jupyter_notebook/실습자료-손에잡히는판다스/data/airplane_info.csv',
                            engine='python')
airplane_info['총정원'].tail(3)

In [ ]:

# Store the interpolated values in a new column, then compare the original
# and interpolated columns side by side for rows 2105-2108.
airplane_info['총정원_보간'] = airplane_info['총정원'].interpolate()
airplane_info[['총정원', '총정원_보간']][2105:2109]

In [ ]:

특정열의 각 원소 null 여부 boolean 반환
<np.isnan(Series) or Series.apply(np.isnan)> (특정 열 null 값 있는지 여부만 확인할 때는 Series.hasnans)

In [ ]:

# Element-wise boolean mask: True where '총정원' is NaN. Equivalent forms:
np.isnan(airplane_info['총정원'])
# = airplane_info['총정원'].apply(np.isnan) 
# = airplane_info['총정원'].isnull()

In [ ]:

null값이 아닌 값의 칼럼별 개수 <.count()>¶

In [ ]:

# Number of non-null values in each column.
airplane_info.count()

In [ ]:

null값의 칼럼별 개수 <.isnull().sum()>¶

In [ ]:

# Number of null values in each column.
airplane_info.isnull().sum()

In [ ]:

null 아닌 값의 칼럼별 개수 <.notnull().sum()>¶

In [ ]:

# Number of non-null values in each column (same figures as .count()).
airplane_info.notnull().sum()

In [ ]:

null값의 칼럼별 개수의 합계 <.isnull().sum().sum()>¶

In [ ]:

# Total number of null cells in the whole frame.
airplane_info.isnull().sum().sum()

In [ ]:

value_counts메소드에서 null값 빈도 표시 <.value_counts(dropna=False)>¶

In [ ]:

# Frequency table that also counts NaN as its own entry (top 5 shown).
flight_route_info['비고'].value_counts(dropna=False).head()

In [ ]:

특정열 null값 삭제 (index속성을 사용) <Series.index[Series.apply(np.isnan)]>¶

In [ ]:

# Index labels of the rows whose '총정원' value is NaN.
capacity = airplane_info['총정원']
is_missing = capacity.apply(np.isnan)
index_nan = capacity.index[is_missing]

In [ ]:

# Drop the rows collected in index_nan (returns a new frame).
airplane_info.drop(index=index_nan)

In [ ]:

null값을 다른 열의 null값으로 대체
<df.loc[df[A].isnull(),A] = df.loc[df[A].isnull(),B]>

A = null값 있는 열, B = null 값 대체할 값 가진 열¶

In [ ]:

import pandas as pd
import numpy as np
# Load the same airplane CSV into a separate frame named `airplane`.
airplane = pd.read_csv('C:/Users/Gumin JUNG/OneDrive - 서울시립대학교/jupyter_notebook/실습자료-손에잡히는판다스/data/airplane_info.csv', 
                       engine='python')

In [ ]:

# Where '총정원' is null, copy the value of '정원_Y' from the same row.
airplane.loc[airplane['총정원'].isnull(),'총정원'] = airplane.loc[airplane['총정원'].isnull(),'정원_Y']

In [ ]:

중복값 처리¶

중복값 여부 확인 <.duplicated(subset=None(column label or sequence of labels), keep='first')>¶

subset : 중복값 확인 시 고려되는 열
keep='first' : 중복이면 중복의 첫번째 값 duplicated 여부를 False로 반환
keep='last' : 중복이면 중복의 마지막 값 duplicated 여부를 False로 반환
keep=False : 중복이면 무조건 True 반환(나중에 drop_duplicates() 에서 keep 할 생각이 없다는 뜻)

In [ ]:

# Load seaborn's bundled iris example dataset.
iris = sns.load_dataset('iris')

In [ ]:

# Preview the first five rows.
iris.head()

In [ ]:

# Flag rows whose (petal_length, petal_width) pair already appeared earlier.
iris.duplicated(subset=['petal_length', 'petal_width']).head()

중복값의 칼럼별 개수 <.duplicated().sum()>¶

In [ ]:

# Count the rows flagged as duplicates on those two columns.
iris.duplicated(subset=['petal_length', 'petal_width']).sum()

In [ ]:

중복된 열의 정보 삭제 <series.drop_duplicates()>¶

In [ ]:

# Distinct species values, returned as an array.
iris['species'].unique()

In [ ]:

# Distinct species values as a Series that keeps each first occurrence's index label.
iris['species'].drop_duplicates()

In [ ]:

문자열 처리¶

한 셀에 ','로 구분 된 문자열 내용 분리하기¶

In [ ]:

# Turn each comma-separated cell into a list of strings.
mcq['LearningPlatformSelect'] = \
mcq['LearningPlatformSelect'].astype('str').apply(lambda x: x.split(','))
# Expand each list into a row of values, stack them into one long Series and
# drop the inner level so every entry keeps its original row label.
s = mcq.apply(
    lambda x: pd.Series(x['LearningPlatformSelect']),
    axis=1).stack().reset_index(level=1, drop=True)
s.name = 'platform'

In [ ]:

문자열 나누기 <.str.split('나누는기준문자')>¶

In [ ]:

# Sample Series of underscore-delimited strings with one missing value.
ss = pd.Series(['가_나_다', '라_마_바', np.nan, '사_아_자'])

In [ ]:

# Split every string on '_' into a list; NaN stays NaN.
ss.str.split('_')

In [ ]:

문자열 나눈 후 가져오기 <.str.split.get(가져올 문자열 번호) (or str[가져올 문자열 번호]>¶

In [ ]:

# After splitting, take the piece at position 1 from each list.
ss.str.split('_').str.get(1) # = ss.str.split('_').str[1]

In [ ]:

문자열 가져오기 <.str.get(가져올 문자열 번호) (or str[가져올 문자열 번호]>¶

In [ ]:

# ss.get(0) # '가_나_다'
# .str.get(0) returns the first character of each string, not the first element of the Series.
ss.str.get(0) # =ss.str[0]

In [ ]:

문자열 나눈 후 데이터프레임 생성
<.str.split('나누는기준문자', expand=True, n=리스트에서 분리해 낼 원소수)>

In [ ]:

# expand=True spreads the split pieces over columns, producing a DataFrame.
ss.str.split('_', expand=True) #: 분리 후 리스트 원소를 각 열의 값으로 보내 데이터프레임 생성

In [ ]:

# n=1 limits the work to one split from the left, giving two columns.
ss.str.split('_', expand=True, n=1)

In [ ]:

ss.str.rsplit('_', expand=True, n=1)
# rsplit works from the right: with n=1 only the last piece is separated
# and everything before it stays together in the left column.

In [ ]:

문자열 대체 <.str.replace("현재str", "대체str")>¶

In [ ]:

# Replace every '_' with '%'.
ss.str.replace('_', '%') # : 대체값 처리

In [ ]:

열내용을 대문자로 변환
Series.apply(str.upper)

In [ ]:

# Add an upper-cased copy of 'country'. The vectorised .str accessor passes
# missing values through as NaN, whereas apply(str.upper) raises a TypeError
# on the first cell that is not a string.
cars.loc[:, 'COUNTRY'] = cars.loc[:, 'country'].str.upper()
[출처] Dictionary 자료구조와 Pandas 라이브러리|작성자 Gauss

In [ ]:

리스트/Dict/Series/정규식/실수 대체
<.replace(to_replace=['남', '여'], value=1)>

df.replace(to_replace=None, value=None, inplace=False, limit=None, regex=False, method='pad'): Replace values given in to_replace with value.

In [ ]:

# Map '남' to 1 and '여' to 2
stu_info['성별'].replace({'남':1, '여':2}) # '남'은 1, '여'는 2로 대체
# Alternative: replace both '남' and '여' with 1
#stu_info['성별'].replace(['남', '여'], 1) # '남','여' 모두 1로 대체

In [ ]:

# Regex replace with pattern '^.' (any first character) and the value 1.
stu_info['성명'].replace(r'^.',1, regex=True) #모든 문자 1로 변경
# r: raw-string prefix for the pattern, ^: start of string, $: end of string, .: any character

In [ ]:

# Column-specific regex replacement: one pattern and one replacement per column.
stu_info.replace({'성명': r'^S..', '전공': r'...P$'}, 
                 {'성명': 'rao', '전공': 'SEP'}, regex=True)
# In '성명', the three characters starting with S at the beginning become 'rao';
# in '전공', the last four characters ending in P become 'SEP'.

In [ ]:

문자열 통합 <.str.cat(sep='연결구분자')>¶

In [ ]:

# Join all strings of the Series into one string, separated by ','.
ss.str.cat(sep=',') #: 문자열 통합,  sep=',' - 연결시 사용되는 구분자

In [ ]:

# With no separator given, the strings are joined directly one after another.
ss.str.cat() # : 구분자 없으면 주어진 문자열로 처리

In [ ]:

# Element-wise concatenation: append the matching item of the list to each string.
ss.str.cat(['A', 'B', 'C', 'D']) # : 새인자 추가 해 붙이기

In [ ]:

시리즈를 리스트로 변환 <.tolist()>¶

In [ ]:

# Convert the 'member' column to a plain Python list. The call parentheses
# are required; without them the name is bound to the method object itself.
op_eq_list = op_eq['member'].tolist()

In [ ]:

특정 문자열 포함한 행에 대한 데이터프레임 추출
<.loc[Series.str.contains('str')]>

In [ ]:

# Select the rows whose 'Column' value contains the substring 'LearningCategory'.
question.loc[question['Column'].str.contains('LearningCategory')]

In [ ]:

DataFrame 내 자료형별 열 개수 구하기 <.dtypes.value_counts()> (구 .get_dtype_counts()는 pandas 1.0에서 제거됨)¶

In [ ]:

# Number of columns per dtype. DataFrame.get_dtype_counts() was removed from
# pandas; counting the values of .dtypes gives the same table.
college.dtypes.value_counts()

In [ ]:

특정 자료형 열추출 <.select_dtypes(include=['자료형'])>¶

In [ ]:

# Only the int64 columns
college.select_dtypes(include=['int64']).head()
# Only the numeric columns ('number' is equivalent to np.number)
college.select_dtypes(include=['number']).head()
# Only the object-dtype columns
college.select_dtypes(include=['object']).head()

In [ ]:

index값 처리¶

내부 구성 값 크기별 정렬
<.nlargest(숫자), .nsmallest(숫자)>

In [ ]:

# Five largest values of '평균등록금', in descending order
college['평균등록금'].nlargest(5)
# Five smallest values of '평균등록금', in ascending order
college['평균등록금'].nsmallest(5)
# Rows of college with the five largest '평균등록금', in descending order
college.nlargest(5,'평균등록금')
# Rows of college with the five smallest '평균등록금', in ascending order
college.nsmallest(5,'평균등록금')

In [ ]:

임계치 지정 해 값 변환
<.clip, lower=숫자 , upper=숫자>

In [ ]:

# Limit values to the range [2, 8]: smaller values become 2, larger ones become 8.
df["A"].clip(lower=2, upper=8)

In [ ]:

실수 값을 카테고리 값으로 변환
<pd.cut(data, bins, labels=[]) / pd.qcut(data, 범주수, labels=[])>

실수 값을 크기 기준으로 하여 카테고리 값으로 변환할 때 사용
cut: 실수 값의 경계선 지정
qcut: 실수 값의 경계선 미지정, 갯수가 똑같은 구간으로 나눔

In [ ]:

ages = [0, 2, 10, 21, 23, 37, 31, 61, 20, 41, 32, 100]
bins = [1, 15, 25, 35, 60, 99]
labels = ["미성년자", "청년", "중년", "장년", "노년"]

# Bin the plain list; values outside the outermost edges become NaN.
cats = pd.cut(ages, bins=bins, labels=labels)
cats
# pd.cut returns a Categorical object:
# .categories holds the label strings, .codes the integer-encoded category of each value.

# Same binning applied to a DataFrame column.
df4 = pd.DataFrame({"ages": ages})
df4["age_cat"] = pd.cut(df4["ages"], bins=bins, labels=labels)
df4

In [ ]:

# 1000 standard-normal samples split into four equally populated bins.
data = np.random.randn(1000)
cats = pd.qcut(data, 4, labels=["Q1", "Q2", "Q3", "Q4"])
cats
# Count the members of each bin. The Categorical is wrapped in a Series
# because the top-level pd.value_counts function is deprecated.
pd.Series(cats).value_counts()

In [ ]:

특정 컬럼과 특정 인덱스 동시 삭제
<.drop(columns=df.filter().columns, index=[]>

In [ ]:

# Drop the columns whose name starts with m, y or o, and the rows labelled 0, 1, 2.
df.drop(columns=df.filter(regex='^[myo]').columns, index=[0,1,2])

In [ ]:

행 위치 이동
<.shift(이동할행[인덱스]숫자, , freq="시간단위인수")>

ts.shift(1, freq="W")

결측치 처리 시 주로 사용
시계열 인덱스 이동 시에도 사용

In [ ]:

# Move the values down by two positions; the first two entries become NaN.
df2['b'].shift(2)

In [ ]:

# Four random values indexed by consecutive month-end dates starting in January 2018.
ts = pd.Series(np.random.randn(4), index=pd.date_range(
    "20180101", periods=4, freq="M"))
ts.shift(1, freq="W")  
# With freq given, shift moves the index labels instead of the values.
# Here each label is moved by one 'W' (weekly) offset step.

In [ ]:

특정기준으로 몇 번째 값인지 보여 줌
<.rank, method=기준이 되는 연산, pct=True 퍼센트>

In [ ]:

# Dense rank of each value, expressed as a fraction (pct=True). Other methods: 'min', 'first'.
df["model_year"].rank(method='dense', pct=True) # method='min', 'first', 'dense' 

In [ ]:

.groupby연산결과가 Series일 때 DataFrame으로 추출
<.groupby(Series, as_index=False)[값Series].sum()>

In [ ]:

# as_index=False keeps "Team" as a regular column, so the result is a DataFrame.
df.groupby("Team", as_index=False)["Points"].sum()

In [ ]:

MultiIndex DataFrame에서 index레벨 변경
<.swaplevel(i=-2, j=-1, copy=True)>

In [ ]:

import pandas as pd
# Sample league table: one row per (team, season) with that season's rank and points.
teams = ['Riders', 'Riders', 'Devils', 'Devils', 'Kings', 'kings',
         'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders']
ranks = [1, 2, 2, 3, 3, 4, 1, 1, 2, 4, 1, 2]
years = [2014, 2015, 2014, 2015, 2014, 2015,
         2016, 2017, 2016, 2014, 2015, 2017]
points = [876, 789, 863, 673, 741, 812, 756, 788, 694, 701, 804, 690]
ipl_data = {'Team': teams, 'Rank': ranks, 'Year': years, 'Points': points}

df = pd.DataFrame(ipl_data)

In [ ]:

# Sum of Points per (Team, Year): a Series with a two-level MultiIndex.
h_index = df.groupby(["Team", "Year"])["Points"].sum()
h_index
# Swap the last two index levels (here Team and Year).
h_index.swaplevel(i=-2, j=-1, copy=True)

In [ ]:

MultiIndex DataFrame에서 index 정렬
<.sort_index(level=0)>

In [ ]:

# After swapping the levels, sort by the new outermost level.
h_index.swaplevel().sort_index(level=0)

In [ ]:

특정 값,위치 검색¶

특정열에서 특정문자 포함한 값 찾기
<Series.str.contains('찾는문자')>

In [ ]:

# Boolean mask: True where the 'Column' value contains 'LearningCategory'.
question['Column'].str.contains('LearningCategory')

In [ ]:

특정열에서 특정값 찾기
<Series.isin(['특정값1', '특정값n'])>

In [ ]:

# Boolean mask of exact matches against the listed values. This cell sits
# under the `isin` heading but repeated the `str.contains` example.
# NOTE(review): the value list is a placeholder taken from the previous cell -- adjust as needed.
question['Column'].isin(['LearningCategory'])

In [ ]:

특정 열 내에 특정 값 있는 경우, 해당 행 전체를 반환
<.query('검색대상열 == 검색대상값')>

In [ ]:

# Return every row whose '총정원' equals 180.
airplane.query('총정원 == 180')

In [ ]:

특정 행열 위치 값 변경
<.at['행명', '열명'] = '대체할값'> (구 .set_value('행명', '열명', '대체할값')는 pandas 1.0에서 제거됨)

In [ ]:

# Set the cell at row label 100, column '총정원' to 180.
# DataFrame.set_value was removed from pandas; .at is the scalar setter that replaces it.
airplane.at[100, '총정원'] = 180

In [ ]:

특정 값 행렬위치로 검색 <.lookup([행레이블1,행레이블n],[열레이블1, 열레이블n])> (pandas 2.0에서 제거됨 → 위치 인덱싱으로 대체)¶

In [ ]:

diamonds = sns.load_dataset('diamonds')
# Fetch the values at (row 3, 'clarity') and (row 0, 'depth').
# DataFrame.lookup was removed from pandas, so the row/column labels are
# translated to positions and the pairs are read from the underlying array.
row_pos = diamonds.index.get_indexer([3, 0])
col_pos = diamonds.columns.get_indexer(['clarity', 'depth'])
diamonds.to_numpy()[row_pos, col_pos]

In [ ]:

특정 행렬위치 레이블로 열 또는 데이터프레임 반환
<.xs(행[열]레이블, axis=0, level=None[반환수준], drop_level=True[False:동일수준반환])>

In [ ]:

# Small labelled frame used to demonstrate cross-sections.
df = pd.DataFrame(
    {'A': [4, 4, 9], 'B': [5, 0, 7], 'C': [2, 9, 3]},
    index=list('abc'),
)
row_a = df.xs('a')            # the row labelled 'a', as a Series
col_c = df.xs('C', axis=1)    # the column labelled 'C', as a Series
print(row_a)
print(col_c)

In [ ]:

멀티인덱스 검색으로 열 또는 데이터프레임 반환
<.xs((행[열]레이블 1, 행[열]레이블 n), axis=0)>

np.round(a, decimals=0, out=None)
study_data.xs(('지원','컴공'), axis=1)

In [ ]:

# Row index: every (year, assignment score) pair.
r_inx = pd.MultiIndex.from_product(
    [[2017, 2018], [1, 2]],
    names=['년도', '과제점수'],
)
# Column index: every (student, department) pair.
c_inx = pd.MultiIndex.from_product(
    [['철수', '영희', '지원'], ['컴공', '경제']],
    names=['학생', '학과'],
)
# 4x6 table of non-negative random values rounded to one decimal.
data = np.abs(np.random.randn(4, 6)).round(1)
study_data = pd.DataFrame(data, index=r_inx, columns=c_inx)

# Row selected by the full key (year 2017, score 1)
study_data.xs((2017, 1))
# Column selection needs axis=1
study_data.xs(('지원', '컴공'), axis=1)

In [ ]:

특정범위 내 특정 값 있는 경우, 불리언값 반환
<.between(시작값,종료값)>

In [ ]:

# Boolean mask: True where '총정원' lies between 1 and 120.
mask = airplane['총정원'].between(1,120)

In [ ]:

특정열을 범주화한 열 생성하기
<df[생성열] = pd.Categorical(np.where(조건, True일때범주명, False일때범주명)>

In [ ]:

# Create the new column, initially filled with empty strings.
airplane['category_R'] = ""

In [ ]:

# '소형' where mask is True, otherwise '중형'; stored as a categorical column.
airplane['category_R']= pd.Categorical(np.where(mask,'소형', '중형'))

In [ ]:

Groupby 활용¶

groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, **kwargs)

by : mapping, function, str, or iterable
axis : int, default 0
level : If the axis is a MultiIndex (hierarchical), group by a particular level or levels
as_index : For aggregated output, return object with group labels as the index. Only relevant for DataFrame input. as_index=False is effectively “SQL-style” grouped output
sort : Sort group keys. Get better performance by turning this off. Note this does not influence the order of observations within each group. groupby preserves the order of rows within each group.
group_keys : When calling apply, add group keys to index to identify pieces
squeeze : Reduce the dimensionality of the return type if possible, otherwise return a consistent type

Groupby에 자주 이용되는 함수

size, count: 갯수
mean, median, min, max: 평균, 중앙값, 최소, 최대
sum, prod, std, var, quantile : 합계, 곱, 표준편차, 분산, 사분위수
first, last: 그룹 데이터 가장 첫번째 데이터와 가장 나중 데이터
agg, aggregate
- 사용자함수를 만들고 agg에 전달 가능
- 여러 그룹연산 동시수행할 경우, 함수 이름 문자열의 리스트를 agg에 전달
describe: 하나의 그룹 대표값이 아닌 여러개 값을 데이터프레임으로 구함
apply: 하나의 대표값이 아닌 데이터프레임 출력. 원하는 그룹연산 없는 경우 사용
transform: 그룹 대표값 아닌 그룹별 계산으로 데이터 자체를 변형

In [ ]:

Groupby 에 의해 split 된 정보 추출
<df.groupby(Series)>

In [ ]:

import pandas as pd 
import numpy as np

# Sample league table written row by row: (Team, Rank, Year, Points).
_rows = [
    ('Riders', 1, 2014, 876),
    ('Riders', 2, 2015, 789),
    ('Devils', 2, 2014, 863),
    ('Devils', 3, 2015, 673),
    ('Kings', 3, 2014, 741),
    ('kings', 4, 2015, 812),
    ('Kings', 1, 2016, 756),
    ('Kings', 1, 2017, 788),
    ('Riders', 2, 2016, 694),
    ('Royals', 4, 2014, 701),
    ('Royals', 1, 2015, 804),
    ('Riders', 2, 2017, 690),
]
_cols = ['Team', 'Rank', 'Year', 'Points']
# Transpose the rows into the column-oriented dict used by the other cells.
ipl_data = {col: list(vals) for col, vals in zip(_cols, zip(*_rows))}

df = pd.DataFrame(ipl_data)

In [ ]:

# Group the rows by the "Team" column (a lazy GroupBy object, nothing computed yet).
name_group = df.groupby("Team")

In [ ]:

for name, group in name_group: 
    print(name)
    print(group)
# Iterating a GroupBy object yields (key, sub-frame) pairs:
# `name` is the group key and `group` is the DataFrame of rows in that group.

In [ ]:

grouped_Series의 group별 key값, 특정 group의 index 및 데이터프레임 추출
<.groups.keys() / .groups["특정그룹의key"] / .get_group("특정그룹의key") / .first()>

GroupBy 클래스 객체에는 각 그룹 데이터의 인덱스를 저장한 groups 속성 활용

df.groupby("Team").groups.keys(): group별 key 값

df.groupby("Team").groups["특정그룹의key"]: 특정 group index

df.groupby("Team").get_group("특정그룹의key"): 특정 group df

df.groupby("Team").first(): group별 1번째 값

In [ ]:

# Mapping of group key -> index labels of the rows in that group
df.groupby("Team").groups

# The group keys only
df.groupby("Team").groups.keys()
# Index labels of the rows in the "Riders" group
df.groupby("Team").groups["Riders"]
# The rows of the "Riders" group as a DataFrame
df.groupby("Team").get_group("Riders")
# First row of every group
df.groupby("Team").first()

In [ ]:

groupby 객체 연산을 pivot형태로 변경
<df.groupby("Team").연산().unstack()>

df.groupby("Team").mean().unstack()

In [ ]:

# Mean Points per (Team, Year); unstack moves Year into the columns, giving a pivot-style table.
df.groupby(["Team", "Year"])["Points"].mean().unstack()

In [ ]:

grouped_Series에 apply - .agg(), .transform(), .filter()¶

.agg():

요약된 통계정보 추출
특정 컬럼에 여러개의 function을 Apply 가능

In [ ]:

# Per-team totals; the string 'sum' or np.sum can be passed instead of the builtin.
df.groupby("Team").agg(sum) # sum대신 'sum', 'np.sum' 가능
# Per-team means; the string 'mean' can be passed instead of np.mean.
df.groupby("Team").agg(np.mean) # np.mean대신 'mean' 가능

In [ ]:

# Several aggregations applied to every column at once
df.groupby(["Team"]).agg([sum, 'mean', np.std])
# A different aggregation per column; the group key becomes the index
df.groupby(["Team"]).agg(
    {"Points":sum, "Rank":min, "Year":"count"}) # groupby대상이 인덱스가 됨
# Same, but the group key stays a regular column
df.groupby(["Team"], as_index=False).agg(
    {"Points":sum, "Rank":min, "Year":"count"}) # groupby대상이 인덱스되지 않음

In [ ]:

# Per-column lists of aggregations; the result has two-level column labels.
df.groupby(["Team"]).agg(
    {"Points": [min, max, sum],
     "Rank":"count", 
     "Year":["first", "unique","nunique"]})

In [ ]:

.transform():

Aggregation과 달리 key값 별로 요약된 정보가 아님
개별 데이터의 변환을 지원함
max 나 min 처럼 Series 데이터에 적용되는 데이터들은 key 값을 기준으로 Grouped 된 데이터 기준

In [ ]:

# agg: one summary row per team
df.groupby(["Team"]).agg(np.mean)
# transform: same shape as the input, each row holding its own group's mean
df.groupby(["Team"]).transform(np.mean)

In [ ]:

.filter():

특정 조건으로 데이터를 검색할 때 사용
filter 안에는 boolean 조건이 존재해야함
len(x)는 grouped된 dataframe 개수

In [ ]:

# agg: one boolean per team and column telling whether the group has at least 3 rows
df.groupby(["Team"]).agg(lambda x: len(x) >= 3)
# filter: keeps the original rows of the teams that have at least 3 rows
df.groupby(["Team"]).filter(lambda x: len(x) >= 3)

In [ ]:

grouped_Series의 기본 연산
<df.groupby("Team").연산메소드(level=연산의 기준이 되는 Series 번호)>

In [ ]:

# Sum of Points per (Team, Year, Rank): a Series with a three-level MultiIndex.
h_index = df.groupby(["Team", "Year", "Rank"])["Points"].sum()

In [ ]:

# Total per value of index level 1 (Year). The `level` argument of
# Series.sum was removed from pandas; grouping by the level is the replacement.
h_index.groupby(level=1).sum()

In [ ]:

grouped_Series의 index 레벨 변경
<grouped_Series.연산메소드().swaplevel((i=-2, j=-1, copy=True): 멀티인덱스 i와 j 간 레벨변경

In [ ]:

# Exchange index levels 0 and 2 (Team and Rank).
h_index; h_index.swaplevel(0,2, copy=True)

In [ ]:

grouped_Series 연산을 돕는 Multiindex정리방법 1:
droplevel(level=0): 대상Index레벨낮춤

In [ ]:

# Three aggregations of Points give two-level columns: ('Points', 'sum'), ...
grouped = df.groupby("Team").agg({"Points": [sum, 'mean', np.std]})
# Drop the outer level so only 'sum', 'mean', 'std' remain as column names.
grouped.columns = grouped.columns.droplevel(level=0)
# Give the flattened columns descriptive names.
grouped = grouped.rename(columns={
    "sum": "Points_sum", "mean": "Points_mean", "std": "Points_std"
})
grouped.head()

In [ ]:

grouped_Series 연산을 돕는 Multiindex정리방법 2:
ravel(): MultiIndex를 단순배열로 변경

In [ ]:

grouped = df.groupby("Team").agg({"Points": [sum, 'mean', np.std]})
# Each two-level column label is a tuple such as ('Points', 'sum');
# joining its parts with '_' yields flat names like 'Points_sum'.
grouped.columns = ["_".join(x) for x in grouped.columns.ravel()]
grouped.head()

In [ ]:

grouped_Series의 index 기준으로 정렬
<grouped_Series.연산메소드().sort_index(level=정렬의 기준이 되는 Series 번호)>

In [ ]:

# Sort the Series by index level 1.
h_index.sort_index(level=1)

In [ ]:

피벗테이블 (groupby와 비교)¶

Index 축은 groupby와 동일
Column에 추가로 labelling 값을 추가
Value에 numeric type 값을 aggregate 하는 형태 참조: https://wesmckinney.com/blog/fast-and-easy-pivot-tables-in-pandas-0-5-0/

stu_info.pivot_table(values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All')

data: 분석할 데이터프레임 (메서드일 때는 필요하지 않음)
values: 분석할 데이터프레임에서 분석할 열
index: 행 인덱스로 들어갈 키 열 또는 키 열의 리스트
columns: 열 인덱스로 들어갈 키 열 또는 키 열의 리스트
aggfunc: 분석 메서드
fill_value: NaN 대체 값
margins: 모든 데이터를 분석한 결과를 오른쪽과 아래에 붙일지 여부
margins_name: 마진 열(행)의 이름

In [ ]:

# Normalise the gender column: strip the trailing space from '남 ' / '여 '
# and map '정보없음' to '여'.
stu_info['성별'] = stu_info['성별'].replace(
    {'남 ' : '남', '여 ': '여', '정보없음':'여'})

In [ ]:

# Number of students per major
## Pivot-table version
stu_info.pivot_table(values='성명', index='전공', 
                     aggfunc='count').rename(
                    columns={'성명':'학생수'})

## groupby version
stu_info.groupby("전공")[['성명']].agg('count').rename(columns={'성명':'학생수'})

In [ ]:

# Number of students per major, broken down by gender
## Pivot-table version
stu_info.pivot_table(values='성명', index='전공', 
                     columns='성별', aggfunc='count')
## groupby version
stu_info.groupby(["전공","성별"])["성명"].count().unstack()

In [ ]:

# Students per continent / country / major, restricted to start dates after '2012/01'
## Pivot-table version (margins=True adds the totals row and column)
stu_info[stu_info['참가기간(시작)'] > '2012/01'].pivot_table(
    values='성명', index=['대륙','국가'], columns='전공',
    aggfunc='count', fill_value='-', margins=True)
## groupby version
stu_info[stu_info['참가기간(시작)'] > '2012/01'].groupby(
    ['대륙', '국가', '전공'])['성명'].count().unstack().fillna('-')

In [ ]:

# Student count and graduation count per continent / country / major / gender
## Pivot-table version
stu_info[stu_info['참가기간(시작)'] > '2012/01'].pivot_table(
    index=['대륙','국가'], columns=['전공','성별'],
    aggfunc={'성명': 'count', '졸업일':'count'}, fill_value='-').rename(
    columns={'성명':'학생수', '졸업일':'졸업여부'}).replace(0,'-')
## groupby version
stu_info[stu_info['참가기간(시작)'] > '2012/01'].groupby(
    ['대륙','국가','전공','성별'])[['성명','졸업일']].count(
    ).unstack(level=2).unstack().fillna('-').rename(
     columns={'성명':'학생수', '졸업일':'졸업여부'}).replace(0,'-')

In [ ]:

# So that aggfunc=np.sum can be used for counting,
# turn every '성명' value into 1.
name = \
stu_info['성명'].replace(
                        r'^.', 1, regex = True)
# Same result:
#stu_info['성명'].replace(stu_info[stu_info['성명'] != ""]['성명'], 1)

dateutil 패키지의 parse 명령: 자동으로 형식 문자열 찾아 datetime.datetime 클래스 객체로 변환

In [ ]:

# Import the submodule explicitly: a bare `import dateutil` does not
# guarantee that `dateutil.parser` has been loaded.
import dateutil.parser
df_phone = pd.read_csv("./data/phone_data.csv")
# Parse each date string; dayfirst=True reads ambiguous dates as day/month (European order).
df_phone['date'] =df_phone['date'].apply(dateutil.parser.parse, dayfirst=True) 

# Total duration per (month, item) row and per network column; empty cells shown as 0.
df_phone.pivot_table(["duration"],
                index = [df_phone.month, df_phone.item],
                columns = df_phone.network, aggfunc="sum", fill_value=0)

In [ ]:

날짜,시간 인덱스¶

날짜 배열 빈도 기준 확인
<날짜객체.freq>

idx_d.freq

In [ ]:

import pandas as pd
# Three consecutive dates starting 2013-01-01, wrapped in an Index.
daily = pd.date_range('20130101', periods=3)
idx_d = pd.Index(daily)
# pd.DatetimeIndex(['20130101', '20130102', '20130103'])
# holds the same dates, except that its freq is None.
idx_d.freq

In [ ]:

날짜시간 인덱스생성과 빈도수 지정 코드¶

pd.date_range('20180703', '20191203', freq = 'H') pd.date_range('20180703', freq = 'H', periods=4): period가 지정되면 enddate 인수는 없음

period 에 freq 인수에 따른 빈도수를 지정한다
freq 인수는 아래와 같다
- Y[A]:연말, YS[AS]:연초, Q:분기말, QS:분기초, M:월말, MS:월초, W:주일, D:일
- B:영업일, BY[A]:영업기준연말, BYS[AS]:영업기준연초, BQ:영업기준분기말, BQS:영업기준분기초, BM: 영업기준월말, BMS:영업기준월초
- W-MON: 주(월요일), WOM-2THU: 각 달의 두번째 목요일, Q-JAN: 각 분기의 첫달의 마지막 날, Q-DEC: 각 분기의 마지막 달의 마지막 날
- H:시, T(min):분, S:초, L(ms):밀리초, U(us):마이크로초, N:나노초 (주의: MS는 '월초'를 뜻하며 밀리초가 아님. 피코초·펨토초는 pandas freq로 지원되지 않음)

In [ ]:

# Display the local image of date-offset aliases.
Image("./img/date offset.png")

Out[ ]:

In [ ]:

# Four hourly timestamps starting at 2018-07-03 00:00.
pd.date_range('2018/07/03', freq = 'H', periods=4)

In [ ]:

# Four business-month-start dates on or after 2016-07-03.
pd.date_range('20160703', freq = 'BMS', periods=4)

In [ ]:

날짜시간 차 인덱스생성 <pd.timedelta_range>¶

pd.date_range('20180703', '20191203', freq = 'H')
pd.date_range('20180703', freq = 'H', periods=4): period가 지정되면 enddate 인수는 없음

period 에 freq 인수에 따른 빈도수를 지정한다
freq 인수는 아래와 같다
- Y[A]:연말, YS[AS]:연초, Q:분기말, QS:분기초, M:월말, MS:월초, W:주일, D:일
- B:영업일, BY[A]:영업기준연말, BYS[AS]:영업기준연초, BQ:영업기준분기말, BQS:영업기준분기초, BM: 영업기준월말, BMS:영업기준월초
- H:시, T(min):분, S:초, L(ms):밀리초, U(us):마이크로초, N:나노초 (주의: MS는 '월초'를 뜻하며 밀리초가 아님. 피코초·펨토초는 pandas freq로 지원되지 않음)

In [ ]:

# Four hourly timestamps starting at 2018-07-03 00:00.
pd.date_range('2018/07/03', freq = 'H', periods=4)

In [ ]:

주기 나타내는 인덱스로 변경(freq 변경)
<pd.to_period('D')>

dr3.to_period('D')

In [ ]:

# 24 hourly timestamps starting 2018-08-03, converted to day-resolution periods.
dr3 = pd.date_range('2018-08-03', periods=24, freq='H')
dr3.to_period('D')

In [ ]:

주기 인덱스 생성
<pd.period_range('2018-01', periods=13, freq='M')>

pd.period_range('2018-01', periods=13, freq='M')

In [ ]:

# PeriodIndex of 13 consecutive months starting with 2018-01.
dr_m = pd.period_range('2018-01', periods=13, freq='M')
dr_m

In [ ]:

시간 간격 인덱스 생성
<pd.timedelta_range(0, periods=10, freq='H')>

pd.timedelta_range(0, periods=10, freq='H')

In [ ]:

# TimedeltaIndex of 10 hourly steps starting from 0.
tm_1 = pd.timedelta_range(0, periods=10, freq='H')
tm_1

In [ ]:

시간 차이 계산 <tm_1 - tm_1[1]>¶

tm_1 - tm_1[1] (벡터화 연산)

In [ ]:

# Vectorised subtraction: every element minus the element at position 1.
tm_1 - tm_1[1]

In [ ]:

# Eight consecutive dates starting 2018-07-03.
dr2 = pd.date_range('2018-07-03', periods=8)
dr2

In [ ]:

# Elapsed time of each date relative to the first one (a TimedeltaIndex).
dr2 - dr2[0]

In [ ]:

다운샘플링 시 sum/mean등의 그룹연산으로 대표값 구함
<.resample(단위시간).sum()>

data.resample('3h').sum() 1시간-> 3시간으로 그룹화 후 연산(sum)
data.resample('D').sum() 1시간-> 1일로 그룹화 후 연산(sum)
data.resample('W').mean() 1시간-> 1주로 그룹화 후 연산(mean)
data.loc['2012-10'].sum() 2012년 10월 한달 인도수 연산(sum)
data.asfreq('W') 기존 Index 빈도를 주 단위로 변경

In [ ]:

# Load the CSV with the 'Date' column parsed as a datetime index.
data = pd.read_csv('C:/Users/Gumin JUNG/OneDrive - 서울시립대학교/jupyter_notebook/실습자료-손에잡히는판다스/data/hanriver_bridge.csv', 
                   index_col='Date', parse_dates=True, engine='python')
data.head(2)

다운-샘플링은 원래 데이터가 그룹으로 묶여 그룹바이(groupby)때와 같이 그룹 연산 해서 대표값을 구함

In [ ]:

# Daily totals, then the row for 2012-10-03
data.resample('D').sum().loc['2012-10-03']
# Daily totals (first rows)
data.resample('D').sum().head()
# Weekly means (first rows)
data.resample('W').mean().head()

# Column totals over all rows of October 2012
data.loc['2012-10'].sum()

# asfreq changes the DatetimeIndex to another frequency while keeping the
# values found at the new index positions (no aggregation).
data.asfreq('W').head()

날짜가 아닌 시/분 단위에서
- 구간위 왼쪽 한계값(가장 빠른 값)은 포함
- 오른쪽 한계값(가장 늦은 값)은 포함하지 않음 (즉, 가장 늦은 값은 다음 구간에 포함됨)
  e.g. 10분 간격으로 구간 만들면 10의 배수가 되는 시각은 구간의 시작점이 됨

In [ ]:

# 60 random values at one-minute intervals starting 2018-01-01 00:00.
ts = pd.Series(np.random.randn(60), index=pd.date_range(
    "2018-1-1", periods=60, freq="T"))
ts.head(20)
# Sum within 10-minute bins.
ts.resample('10T').sum()

왼쪽 아닌 오른쪽 한계값을 구간에 포함하려면 closed="right" 인수사용
이 때는 10의 배수가 되는 시각이 앞 구간에 포함 됨

In [ ]:

# closed="right" makes each bin include its right edge instead of its left edge.
ts.resample('10T', closed="right").sum()

ohlc 메서드: 구간의 시고저종(open, high, low, close)값 구함

In [ ]:

# Open / high / low / close value of each 5-minute bin.
ts.resample('5T').ohlc()

업-샘플링 시 ffill/bfill로 대표값 구함
<.resample(단위시간).ffill [or bfill] ()>

업-샘플링의 경우에는 실제로 존재하지 않는 데이터를 만들어야 하며 아래 두 방식 사용

ffill: 앞에서 나온 데이터를 뒤에서 그대로 쓰는 forward filling 방식
bfill: 뒤에서 나올 데이터를 앞에서 미리 쓰는 backward filling 방식

In [ ]:

# Upsample to 30-second steps, filling new slots from the previous value (forward fill)
ts.resample('30s').ffill().head(20)
# ... or from the next value (backward fill)
ts.resample('30s').bfill().head(20)

In [ ]:

인덱스¶

index변경
df.rename({'기존index문자':'대체할index문자'})

In [ ]:

# Series with the default integer index 0..3.
s = pd.Series([1,2,3,4])
s

In [ ]:

# Relabel the index using an old-label -> new-label mapping.
s_re = s.rename({0:'a',1:'b',2:'c',3:'d'})
s_re

In [ ]:

index 순서변경 <df.reindex('설정할index문자 [또는 리스트]')>¶

In [ ]:

# No value exists for label 'e', so that entry becomes NaN.
s_re.reindex(['b','a','c','e'])

In [ ]:

범주형 자료로 된 index를 .rename으로 정수로 변경 후 덧셈연산¶

In [ ]:

# 1. Convert the string index labels of s3 to integers with .rename, then add.
#    (The explanatory lines are now comments; as bare text they were a syntax error.)
s1 = pd.Series([10,20,30], index=[0,1,2])
s1.index.dtype # dtype('int64')
s3 = pd.Series([11,12,13], index=list('012'))
s3.index.dtype # dtype('O')
s3 = s3.rename({'0':0, '1':1,'2':2})
s1 + s3

# 2. Alternatively, build a new Series and cast its index values to int64
#    with astype before adding.
s4 = pd.Series([11,12,13], index=list('012'))
s4.index = s4.index.values.astype('int64')
s1 + s4

In [ ]:

1부터 시작하는 인덱스 만들기¶

In [ ]:

# Label the rows 1..len(df). The stop value is derived from the frame,
# because the hard-coded 21 only worked for a frame of exactly 20 rows.
new_list = np.arange(1, len(df) + 1)
df.set_index(new_list).rename(columns={'index': '구분'})

In [ ]:

A인덱스객체는 포함되나 B객체에 없는 레이블 검색 <A인덱스객체.difference(B인덱스객체)>¶

idx1.difference(idx2): idx1에는 있으나 idx2에는 없는 레이블 검색

In [ ]:

# Two overlapping integer indexes.
idx1 = pd.Index([1, 2, 3, 4])
idx2 = pd.Index(range(1, 4))
# Labels present in idx1 but absent from idx2.
only_in_idx1 = idx1.difference(idx2)
only_in_idx1

In [ ]:

범주형 인덱스 추가
<s.index.add_categories(새카테고리,inplace=False)>

s.index.add_categories(5) 범주형 인덱스 추가

In [ ]:

# Series indexed by a categorical index with categories 1..4.
inx_i = pd.CategoricalIndex([1, 2, 3, 4])
s = pd.Series([1, 2, 3, 4], index=inx_i)
s
# Returns a new index whose category set additionally contains 5.
s.index.add_categories(5)

In [ ]:

원하는 위치에 칼럼, 값 생성
<index.insert(loc, item) / df.insert(loc, column, value, allow_duplicates = False)>

df.insert(loc, column, value, allow_duplicates = False)
s.index.insert(loc, item)

stu_info["학과"] = "IUDP" 와 동일한 결과이나, insert 명령은 위치지정가능

In [ ]:

# Insert a constant column "학과" at position 0 (modifies stu_info in place).
stu_info.insert(0, "학과", "IUDP")

In [ ]:

# Return a new Index with the label "추가" inserted at position 341 (stu_info itself is unchanged).
stu_info.index.insert(341, "추가")

In [ ]:

# Insert a value that differs per row: label each student with an intake cohort.

def _cohort_label(start):
    """Return '<year> IUDP' for start years 2012-2018, otherwise '고려대  MPA'."""
    year = start.year
    if 2012 <= year <= 2018:
        return "{} IUDP".format(year)
    # Every other year lands here, as does a missing date, whose year
    # fails both comparisons.
    return "고려대  MPA"

# Map over the column instead of looping with positional labels: the result
# shares stu_info's index, so it lines up even when the index is not 0..n-1.
입학년도 = stu_info['참가기간(시작)'].map(_cohort_label)

stu_info.insert(0, 'Year', 입학년도)
stu_info.query('Year == "고려대  MPA"')
# Same result: stu_info.loc[stu_info['Year'].isin(["고려대  MPA"])]

In [ ]:

문자열수식으로 열 간 연산 수행 <.eval(str, inplace=False)>¶

In [ ]:

# Rename the two columns, then compute their sum from a string expression.
data.columns=['좌측', '우측']
data['합산'] = data.eval('좌측 + 우측') # same result as data.eval('합산 = 좌측 + 우측')
data.head()

In [ ]:

numpy 연산 <np.mean():평균, np.var():분산, np.var(x, ddof=1):비편향분산, np.std():표준편차, np.percentile()>¶

np.mean(x) 평균
np.var(x) 분산
np.median(x) 중앙값
np.percentile(x, 0) 최소값
np.percentile(x, 25) 1사분위수
np.percentile(x, 50) 중간값, 2사분위수
np.percentile(x, 75) 3사분위수
np.percentile(x, 100) 최대값
from scipy.stats import describe; describe(x) # scipy 패키지로 한 번에 기술 통계값

In [ ]:

x = np.array([18,   5,  10,  23,  19,  -8,  10,   0,   0,   5,   2,  15,   8,
              2,   5,   4,  15,  -1,   4,  -7, -24,   7,   9,  -6,  23, -13])
# Variance (population variance, i.e. ddof=0)
np.var(x);
# Same result computed by hand: the mean of the squared deviations.
# The mean is computed once and the subtraction is vectorized, instead of
# looping over indices and recomputing np.mean(x) for every element.
y = (x - np.mean(x)) ** 2
np.sum(y)/len(x);

# Median: with an even number of samples, the average of the two middle values
np.median(x);

# Quartiles
np.percentile(x, 0)  # minimum
np.percentile(x, 25)  # 1st quartile
np.percentile(x, 50)  # median, 2nd quartile
np.percentile(x, 75)  # 3rd quartile
np.percentile(x, 100)  # maximum

# All descriptive statistics at once with scipy
from scipy.stats import describe
describe(x)

In [ ]:

난수 생성

1. np.random.seed(x) / 2. np.random.rand(3,5) / 3. np.random.randn(3,5) / 4. np.random.randint(low, high=None, size=None)>¶

np.random.seed(x) 난수 수열 생성을 위한 시작 숫자(seed) 지정 (x >= 0)
np.random.rand(3,5): 0부터 1사이의 균일 분포
np.random.randn(3,5): 가우시안 표준 정규 분포(기댓값 0, 표준편차 1)
np.random.randint(low, high=None, size=None): 균일 분포의 정수 난수

high 미입력 시 0과 low사이 숫자를,
high 입력 시 low와 high 사이의 숫자를 출력
size는 난수의 숫자

In [ ]:

# Fix the seed so the random sequences below are reproducible.
np.random.seed(0)

# rand: uniform samples between 0 and 1; the arguments give the output shape.
np.random.rand(10)
np.random.rand(3, 5)

# randn: samples from the standard normal distribution (mean 0, standard
# deviation 1); the arguments are used the same way as for rand.
np.random.randn(10)
np.random.randn(3, 5)

# randint: uniformly distributed random integers.
np.random.randint(10, size=10)  # only `low` given: integers drawn from 0 up to 10
np.random.randint(10, 20, size=10)  # integers drawn from 10 up to 20
np.random.randint(10, 20, size=(3, 5))  # same range, returned as a 3x5 array

In [ ]:

균일간격 점 생성
np.linspace(시작값, 끝값, 256)

np.linspace(-np.pi, np.pi, 256) 시작점과 끝점을 균일 간격으로 나눈 점들을 생성

In [ ]:

# 256 evenly spaced points between -pi and pi.
np.linspace(-np.pi, np.pi, 256)

In [ ]:

난수 데이터 순서변경과 샘플링
1.np.random.shuffle(np.arange(10))
2.np.random.choice(5, 10, p=[0.1, 0, 0.3, 0.6, 0])

np.random.shuffle(x): 배열 x의 데이터 순서를 제자리에서(in-place) 변경 (반환값 없음)
np.random.choice(a, size=None, replace=True, p=None): 데이터 샘플링

a : 배열이면 원래의 데이터, 정수이면 arange(a) 명령으로 데이터 생성
size : 정수. 샘플 숫자
replace : 불리언. True이면 한번 선택한 데이터를 다시 선택 가능
p : 배열. 각 데이터가 선택될 수 있는 확률

In [ ]:

x = np.arange(10)
np.random.shuffle(x)  # reorders x in place

np.random.choice(5, 5, replace=False)  # same effect as shuffle: a random ordering of 0..4
np.random.choice(5, 3, replace=False)  # pick only 3, without replacement
np.random.choice(5, 10)  # pick 10, values may repeat
np.random.choice(5, 10, p=[0.1, 0, 0.3, 0.6, 0])  # pick 10 with custom selection probabilities

In [ ]:

난수가 정수일 때 데이터 카운팅
1.np.unique([11, 11, 2, 2, 34, 34], return_counts=True)
2.np.bincount([1, 1, 2, 2, 2, 3], minlength=6)

np.unique([11, 11, 2, 2, 34, 34], return_counts=True)

중복 아닌 값 리스트 출력
return_counts=True 설정 시 각 값을 가진 데이터 갯수 출력

np.bincount([1, 1, 2, 2, 2, 3], minlength=6)

최소 여섯번 째 인덱스까지의 데이터를 카운트, 없는 데이터 인덱스에 대해 0으로 표시

In [ ]:

# Unique values of the input with duplicates removed.
np.unique([11, 11, 2, 2, 34, 34])
a = np.array(['a', 'b', 'b', 'c', 'a'])
# With return_counts=True, np.unique also returns how often each unique value occurs.
index, count = np.unique(a, return_counts=True)
index
count

# Count occurrences of each integer; minlength=6 guarantees slots for 0..5,
# and values that never appear are reported as 0.
np.bincount([1, 1, 2, 2, 2, 3], minlength=6)

In [ ]:

Power BI를 Jupyter Notebook에 삽입하기
<from IPython.display import IFrame
powerBiEmbed = 'power bi embed code'
IFrame(powerBiEmbed, width=int, height=int)>

from IPython.display import IFrame
powerBiEmbed = 'https://app.powerbi.com/view?...'
IFrame(powerBiEmbed, width=800, height=600)

In [ ]:

from IPython.display import IFrame
# URL of the Power BI report to embed.
powerBiEmbed = 'https://app.powerbi.com/view?r=eyJrIjoiY2MzZWYzOTMtOTEwMi00OWVkLTk4ZTQtMjQxZTk2ODdlYjg1IiwidCI6ImIwOTg4YzEzLWUxMzQtNGI3OC1hZGY3LTVkMWU1YjI0MTAxOCIsImMiOjEwfQ%3D%3D'

# Display the report inside the notebook as an 800x600 iframe.
IFrame(powerBiEmbed, width=800, height=600)

In [ ]:

Azure Jupyter Notebook에 확장기능 설치¶

In [ ]:

# Lines starting with "!" are IPython shell escapes, not Python statements.
# Install the contrib nbextensions package, register it for the current user,
# then enable the code-folding extension.
!pip install jupyter_contrib_nbextensions
!jupyter contrib nbextension install --user
!/usr/local/bin/jupyter nbextension enable codefolding/main
# NOTE(review): presumably a throwaway function for trying out code folding — confirm.
def f():
    print('hello there')

In [ ]:

pd.concat(
objs, # Series, DataFrame, Panel object
axis=0, # 0: 위+아래로 합치기, 1: 왼쪽+오른쪽으로 합치기
join='outer', # 'outer': 합집합(union), 'inner': 교집합(intersection)
join_axes=None, # axis=1 일 경우 특정 DataFrame의 index를 그대로 이용하려면 입력 (pandas 1.0에서 제거됨: 대신 결과에 .reindex() 사용)
ignore_index=False, # False: 기존 index 유지, True: 기존 index 무시
keys=None, # 인덱스 기입, 계층적 index 사용하려면 keys 튜플 입력
# e.g. pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
levels=None,
names=None, # index의 이름 부여하려면 names 튜플 입력
verify_integrity=False, # True: index 중복 확인
copy=True) # 복사

In [ ]:

pd.merge(
left, right, # merge할 DataFrame 객체 이름
how='inner', # left, right, inner (default), outer
on=None, # merge의 기준이 되는 Key 변수
left_on=None, # 왼쪽 DataFrame의 변수를 Key로 사용
right_on=None, # 오른쪽 DataFrame의 변수를 Key로 사용
left_index=False, # 만약 True 라면, 왼쪽 DataFrame의 index를 merge Key로 사용
right_index=False, # 만약 True 라면, 오른쪽 DataFrame의 index를 merge Key로 사용
sort=False, # True이면 merge 된 후의 DataFrame을 join Key 기준으로 정렬 (기본값은 False)
suffixes=('_x', '_y'), # 중복되는 변수 이름에 대해 접미사 부여 (defaults to '_x', '_y')
copy=True, # merge할 DataFrame을 복사
indicator=False) # 병합된 이후의 DataFrame에 left_only, right_only, both 등의 출처를 알 수 있는 부가 정보 변수 추가

출처: https://rfriend.tistory.com/256 [R, Python 분석과 프로그래밍 (by R Friend)]

In [ ]:

전체 설문대상자 중 무응답(결측값) 비율(%)
data.isnull().sum()/data.isnull().count()*100

In [ ]:

# Percentage of null cells per column: null count divided by the row count, times 100.
train_df.isnull().sum()/train_df.isnull().count()*100

In [ ]:

접두어,접미어 붙이기
data.add_prefix('X_') , data.add_suffix('_Y')

In [ ]:

# Prepend 'X_' to every column label of df; the result is displayed, not assigned.
df.add_prefix('X_')

In [ ]:

# Append '_Y' to every column label of df; the result is displayed, not assigned.
df.add_suffix('_Y')

In [ ]:

행 역순정렬
data.loc[: : -1].reset_index(drop=True)

In [ ]:

# Reverse the row order, renumber the index from 0 (dropping the old one),
# and show the first rows.
drinks.loc[::-1].reset_index(drop=True).head()

In [ ]:

열 역순정렬
data.loc[:, : : -1]

In [ ]:

# Reverse the column order (all rows, columns sliced with step -1) and show
# the first rows. The previous cell content reversed rows, not columns.
drinks.loc[:, ::-1].head()

In [ ]:

숫자로 된 문자형 데이터열을 숫자형 열로 변경
data.astype({'col_name1':'float', 'col_name2':'float'})

In [ ]:

# Cast the columns col_one and col_two to float; the result is displayed, not assigned.
df.astype({'col_one':'float', 'col_two':'float'})

In [ ]:

'-' 등의 문자열포함 데이터열을 숫자형 열로 변경
Series적용: pd.to_numeric(data.col_name, errors='coerce').fillna(0)
DataFrame적용: df.apply(pd.to_numeric, errors='coerce').fillna(0)

In [ ]:

# errors='coerce': values that cannot be parsed as numbers (e.g. '-') become NaN
# instead of raising; fillna(0) then replaces those NaN with the number 0.
# (The function is pd.to_numeric and the keyword is errors='coerce'.)
pd.to_numeric(df.col_three, errors='coerce').fillna(0)

# Apply the same conversion to every column of the DataFrame.
df = df.apply(pd.to_numeric, errors='coerce').fillna(0)
df

In [ ]:

Plotly¶

플롯에 마진없애기¶

출처: https://stackoverflow.com/questions/50466450/how-to-remove-plot-margin-in-plotly-tools-make-subplots

In [ ]:

# Set the left, right, bottom and top margins of the figure layout to 0.
fig['layout'].update(margin=dict(l=0,r=0,b=0,t=0))

dticks: tick 간격¶

출처: https://plot.ly/python/tick-formatting/

In [ ]:

# Linear x-axis ticks: the first tick at 0.5, then one every 0.75.
# NOTE(review): `go` is presumably plotly.graph_objs — confirm against the import cell.
layout = go.Layout(
    xaxis = go.layout.XAxis(
        tickmode = 'linear',
        tick0 = 0.5,
        dtick = 0.75
    )
)

X축 공유 플롯에서 플롯별로 높이를 다르게 지정하기¶

출처: https://stackoverflow.com/questions/52856836/plot-ly-different-height-for-subplots-with-shared-x-axes

In [ ]:

# Three stacked subplots (3 rows, 1 column) sharing one x-axis, each with its
# own title; row_width gives the rows different relative sizes.
# NOTE(review): `tools` is presumably plotly.tools — confirm; newer plotly
# releases provide make_subplots in plotly.subplots.
fig = tools.make_subplots(rows=3, cols=1,
                          shared_xaxes=True,
                          vertical_spacing=0.1,
                          subplot_titles=('subtitle 1', 'subtitle 2', 'subtitle 3'),
                          row_width=[0.2, 0.4, 0.2]
                         )

[파이썬/snippet] pandas & numpy snippet모음

데이터 불러오기, 데이터프레임 편집¶

라이브러리 import¶

Pandas 출력을 옆으로 출력¶

기본 글꼴 변경¶

csv 필드값이 매우 큰 경우 에러 해결¶

학생정보 엑셀파일 불러오기¶

이미지 불러오기¶

데이터프레임 편집¶

데이터프레임 사이즈 축소¶

복수파일의 데이터셋에서 단일 데이터프레임 생성(행방향): glob 패키지 사용¶

복수파일의 데이터셋에서 단일 데이터프레임 생성(열방향): glob 패키지 사용¶

클립보드로 데이터프레임 만들기¶

데이터프레임을 두 개의 랜덤 데이터셋으로 나누기¶

데이터프레임 특정열 내 값유무에 따른 필터링¶

갯수 많은 데이터순으로 필터링¶

칼럼명의 문자열을 나눠 multiple 칼럼 생성¶

인코딩 확인¶

과학적 표기법¶

단어수세기¶

항목의 없는 원소 추출 (연산자 not 사용)¶

apply 변환¶

주제별 정리¶

null값 (누락값) 처리¶

특정 열에 null 있는지 여부만 확인 <.hasnans> / 특정열의 각 원소 null 여부 boolean 반환 <np.isnan(Series)>¶

null값 제외 <Series[Series != 'nan'] Series[np.logical_not(Series == 'nan')] Series[-(Series == 'nan')] Series[~(Series == 'nan')] Series[(Series == 'nan') == False] >

null값 있는 행 또는 열 삭제 <df.dropna(axis=0, how='any', thresh=None(임계값,int), subset=None(array-like), inplace=False)> Series.dropna 의 경우, Series 전체 아닌 null 값만 삭제

null값 개수 확인 <Series[Series != 'nan'].value_counts()>

null값 위치를 고정하고 sorting <.sort_values(by='a', na_position='last')>

null 값 포함해 연산(합계) <df.sum(skipna=False)>

null값 다른 값으로 변경 <.fillna('대체할값')>

각 열 내 null값을 가진 셀의 합들의 합 <stu_info.isnull().sum().sum()>

null값 다른 값으로 변경 (.pivot_table에서) <.pivot_table(values='Series', index='Series', columns='Series', fill_value = '대체할값')>

null값 보간처리 <.interpolate()>¶

특정열의 각 원소 null 여부 boolean 반환 <np.isnan(Series) or Series.apply(np.isnan)> (특정 열 null 값 있는지 여부만 확인할 때는 Series.hasnans)

null값이 아닌 값의 칼럼별 개수 <.count()>¶

null값의 칼럼별 개수 <.isnull().sum()>¶

null 아닌 값의 칼럼별 개수 <.notnull().sum()>¶

null값의 칼럼별 개수의 합계 <.isnull().sum().sum()>¶

value_counts메소드에서 null값 빈도 표시 <.value_counts(dropna=False)>¶

특정열 null값 삭제 (index속성을 사용) <Series.index[Series.apply(np.isnan)]>¶

null값을 다른 열의 값으로 대체 <df.loc[df[A].isnull(),A] = df.loc[df[A].isnull(),B]>

A = null값 있는 열, B = null 값 대체할 값 가진 열¶

중복값 처리¶

중복값 여부 확인 <.duplicated(subset=None(column label or sequence of labels, keep='first')>¶

중복값의 칼럼별 개수 <.duplicated().sum()>¶

중복된 열의 정보 삭제 <series.drop_duplicates()>¶

문자열 처리¶

한 셀에 ','로 구분 된 문자열 내용 분리하기¶

문자열 나누기 <.str.split('나누는기준문자')>¶

문자열 나눈 후 가져오기 <.str.split.get(가져올 문자열 번호) (or str[가져올 문자열 번호]>¶

문자열 가져오기 <.str.get(가져올 문자열 번호) (or str[가져올 문자열 번호]>¶

문자열 나눈 후 데이터프레임 생성 <.str.split('나누는기준문자', expand=True, n=리스트에서 분리해 낼 원소수)>

문자열 대체 <.str.replace("현재str", "대체str")>¶

열내용을 대문자로 변환Series.apply(str.upper)

리스트/Dict/Series/정규식/실수 대체 <.replace(to_replace=['남', '여'], value=1)>

문자열 통합 <.str.cat(sep='연결구분자')>¶

시리즈를 리스트로 변환 <.tolist>¶

특정 문자열 포함한 행에 대한 데이터프레임 추출 <.loc[Series.str.contains('str')]>

DataFrame 내 자료형별 열 개수 구하기 <.get_dtype_counts()>¶

특정 자료형 열추출 <.select_dtypes(include=['자료형'])>¶

index값 처리¶

내부 구성 값 크기별 정렬 <.nlargest(숫자), .nsmallest(숫자)>

임계치 지정 해 값 변환 <.clip, lower=숫자 , upper=숫자>

실수 값을 카테고리 값으로 변환 <.cut(data, bins, labels=[]) / .qcut(data, 범주수, labels=[]>

특정 컬럼과 특정 인덱스 동시 삭제 <.drop(columns=df.filter().columns, index=[]>

행 위치 이동 <.shift(이동할행[인덱스]숫자, , freq="시간단위인수")>

특정기준으로 몇 번째 값인지 보여 줌 <.rank, method=기준이 되는 연산, pct=True 퍼센트>

.groupby연산결과가 Series일 때 DataFrame으로 추출 <.groupby(Series, as_index=False)[값Series].sum()>

MultiIndex DataFrame에서 index레벨 변경 <.swaplevel(i=-2, j=-1, copy=True)>

MultiIndex DataFrame에서 index 정렬 <.sort_index(level=0)>

특정 값,위치 검색¶

특정열에서 특정문자 포함한 값 찾기 <Series.str.contains('찾는문자')>

특정열에서 특정값 찾기 <Series.isin(['특정값1, 특정값n'])>

특정 열 내에 특정 값 있는 경우, 해당 행 전체를 반환 <.query('검색대상열 == 검색대상값')>

특정 행열 위치 값 변경 <.set_value('행명', '열명', '대체할값')>

특정 값 행렬위치로 검색 <.lookup([행레이블1,행레이블n],[열레이블1, 열레이블n])>¶

특정 행렬위치 레이블로 열 또는 데이터프레임 반환 <.xs(행[열]레이블, axis=0, level=None[반환수준], drop_level=True[False:동일수준반환])>

멀티인덱스 검색으로 열 또는 데이터프레임 반환 <.xs((행[열]레이블 1, 행[열]레이블 n), axis=0)>

특정범위 내 특정 값 있는 경우, 불리언값 반환 <.between(시작값,종료값)>

null값 제외
<Series[Series != 'nan']
Series[np.logical_not(Series == 'nan')]
Series[-(Series == 'nan')]
Series[~(Series == 'nan')]
Series[(Series == 'nan') == False] >

null값 있는 행 또는 열 삭제
<df.dropna(axis=0, how='any', thresh=None(임계값,int), subset=None(array-like), inplace=False)>
Series.dropna 의 경우, Series 전체 아닌 null 값만 삭제

null값 개수 확인
<Series[Series != 'nan'].value_counts()>

null값 위치를 고정하고 sorting
<.sort_values(by='a', na_position='last')>

null 값 포함해 연산(합계)
<df.sum(skipna=False)>

null값 다른 값으로 변경
<.fillna('대체할값')>

각 열 내 null값을 가진 셀의 합들의 합
<stu_info.isnull().sum().sum()>

null값 다른 값으로 변경 (.pivot_table에서)
<.pivot_table(values='Series', index='Series', columns='Series', fill_value = '대체할값')>

특정열의 각 원소 null 여부 boolean 반환
<np.isnan(Series) or Series.apply(np.isnan)> (특정 열 null 값 있는지 여부만 확인할 때는 Series.hasnans)

null값을 다른 열의 값으로 대체
<df.loc[df[A].isnull(),A] = df.loc[df[A].isnull(),B]>

문자열 나눈 후 데이터프레임 생성
<.str.split('나누는기준문자', expand=True, n=리스트에서 분리해 낼 원소수)>

열내용을 대문자로 변환
Series.apply(str.upper)

리스트/Dict/Series/정규식/실수 대체
<.replace(to_replace=['남', '여'], value=1)>

특정 문자열 포함한 행에 대한 데이터프레임 추출
<.loc[Series.str.contains('str')]>

내부 구성 값 크기별 정렬
<.nlargest(숫자), .nsmallest(숫자)>

임계치 지정 해 값 변환
<.clip, lower=숫자 , upper=숫자>

실수 값을 카테고리 값으로 변환
<.cut(data, bins, labels=[]) / .qcut(data, 범주수, labels=[]>

특정 컬럼과 특정 인덱스 동시 삭제
<.drop(columns=df.filter().columns, index=[]>

행 위치 이동
<.shift(이동할행[인덱스]숫자, , freq="시간단위인수")>

특정기준으로 몇 번째 값인지 보여 줌
<.rank, method=기준이 되는 연산, pct=True 퍼센트>

.groupby연산결과가 Series일 때 DataFrame으로 추출
<.groupby(Series, as_index=False)[값Series].sum()>

MultiIndex DataFrame에서 index레벨 변경
<.swaplevel(i=-2, j=-1, copy=True)>

MultiIndex DataFrame에서 index 정렬
<.sort_index(level=0)>

특정열에서 특정문자 포함한 값 찾기
<Series.str.contains('찾는문자')>

특정열에서 특정값 찾기
<Series.isin(['특정값1, 특정값n'])>

특정 열 내에 특정 값 있는 경우, 해당 행 전체를 반환
<.query('검색대상열 == 검색대상값')>

특정 행열 위치 값 변경
<.set_value('행명', '열명', '대체할값')>

특정 행렬위치 레이블로 열 또는 데이터프레임 반환
<.xs(행[열]레이블, axis=0, level=None[반환수준], drop_level=True[False:동일수준반환])>

멀티인덱스 검색으로 열 또는 데이터프레임 반환
<.xs((행[열]레이블 1, 행[열]레이블 n), axis=0)>

특정범위 내 특정 값 있는 경우, 불리언값 반환
<.between(시작값,종료값)>

특정열을 범주화한 열 생성하기
<df[생성열] = pd.Categorical(np.where(조건, True일때범주명, False일때범주명))>

Groupby 에 의해 split 된 정보 추출
<df.groupby(Series)>

grouped_Series의 group별 key값, 특정 group의 index 및 데이터프레임 추출
<.groups.keys() / .groups["특정그룹의key"] / .get_group("특정그룹의key") / .first()>

groupby 객체 연산을 pivot형태로 변경
<df.groupby("Team").연산().unstack()>

grouped_Series의 기본 연산
<df.groupby("Team").연산메소드(level=연산의 기준이 되는 Series 번호)>

grouped_Series의 index 레벨 변경
<grouped_Series.연산메소드().swaplevel(i=-2, j=-1, copy=True)>: 멀티인덱스 i와 j 간 레벨변경

grouped_Series 연산을 돕는 Multiindex정리방법 1:
droplevel(level=0): 대상Index레벨낮춤

grouped_Series 연산을 돕는 Multiindex정리방법 2:
ravel(): MultiIndex를 단순배열로 변경

grouped_Series의 index 기준으로 정렬
<grouped_Series.연산메소드().sort_index(level=정렬의 기준이 되는 Series 번호)>

날짜 배열 빈도 기준 확인
<날짜객체.freq>

주기 나타내는 인덱스로 변경(freq 변경)
<pd.to_period('D')>

주기 인덱스 생성
<pd.period_range('2018-01', periods=13, freq='M')>

시간 간격 인덱스 생성
<pd.timedelta_range(0, periods=10, freq='H')>

다운샘플링 시 sum/mean등의 그룹연산으로 대표값 구함
<.resample(단위시간).sum()>

업-샘플링 시 ffill/bfill로 대표값 구함
<.resample(단위시간).ffill [or bfill] ()>

index변경
df.rename({'기존index문자':'대체할index문자'})

범주형 인덱스 추가
<s.index.add_categories(새카테고리,inplace=False)>

원하는 위치에 칼럼, 값 생성
<index.insert(loc, item) / df.insert(loc, column, value, allow_duplicates = False)>

균일간격 점 생성
np.linspace(시작값, 끝값, 256)

난수 데이터 순서변경과 샘플링
1.np.random.shuffle(np.arange(10))
2.np.random.choice(5, 10, p=[0.1, 0, 0.3, 0.6, 0])

난수가 정수일 때 데이터 카운팅
1.np.unique([11, 11, 2, 2, 34, 34], return_counts=True)
2.np.bincount([1, 1, 2, 2, 2, 3], minlength=6)

Power BI를 Jupyter Notebook에 삽입하기
<from IPython.display import IFrame
powerBiEmbed = 'power bi embed code'
IFrame(powerBiEmbed, width=int, height=int)>

전체 설문대상자 중 무응답(결측값) 비율(%)
data.isnull().sum()/data.isnull().count()*100

접두어,접미어 붙이기
data.add_prefix('X_') , data.add_suffix('_Y')

행 역순정렬
data.loc[: : -1].reset_index(drop=True)

열 역순정렬
data.loc[:, : : -1]

숫자로 된 문자형 데이터열을 숫자형 열로 변경
data.astype({'col_name1':'float', 'col_name2':'float'})

'-' 등의 문자열포함 데이터열을 숫자형 열로 변경
Series적용: pd.to_numeric(data.col_name, errors='coerce').fillna(0)
DataFrame적용: df.apply(pd.to_numeric, errors='coerce').fillna(0)