[kaggle][성인 인구조사 소득예측] 🐱💻 1. 첫 캐글 도전
# 기본
import os
# 분석 라이브러리
import pandas as pd
import numpy as np
# 시각화 라이브러리
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
데이터 불러오기¶
# 파일 목록
os.listdir()
['.ipynb_checkpoints',
'adult_data.csv',
'adult_names.csv',
'adult_test.csv',
'[Adult] 1. Basic EDA.ipynb']
# Load the train and test splits.
# The CSV files carry no header row, so header=None keeps the first record as
# data instead of consuming it as column names (the real column names are
# assigned further down).  Outputs recorded in this write-up predate this fix:
# they show the first record as the header and one fewer row per frame.
train = pd.read_csv('adult_data.csv', header=None)
test = pd.read_csv('adult_test.csv', header=None)
train.head(2)
39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
1 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
test.head(2)
25 | Private | 226802 | 11th | 7 | Never-married | Machine-op-inspct | Own-child | Black | Male | 0 | 0.1 | 40 | United-States | <=50K. | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 38 | Private | 89814 | HS-grad | 9 | Married-civ-spouse | Farming-fishing | Husband | White | Male | 0 | 0 | 50 | United-States | <=50K. |
1 | 28 | Local-gov | 336951 | Assoc-acdm | 12 | Married-civ-spouse | Protective-serv | Husband | White | Male | 0 | 0 | 40 | United-States | >50K. |
✨ 필요한 작업
: train, test 열 지정해주기!!
# Show the last 15 lines of the dataset description file, which list the
# attribute names and their possible values.
# NOTE(review): the directory listing earlier in this notebook shows
# 'adult_names.csv', not 'adult_names.txt' — confirm the file name before re-running.
pd.read_table('adult_names.txt').tail(15)
| This data was extracted from the census bureau database found at | |
---|---|
91 | >50K, <=50K. |
92 | age: continuous. |
93 | workclass: Private, Self-emp-not-inc, Self-emp... |
94 | fnlwgt: continuous. |
95 | education: Bachelors, Some-college, 11th, HS-g... |
96 | education-num: continuous. |
97 | marital-status: Married-civ-spouse, Divorced, ... |
98 | occupation: Tech-support, Craft-repair, Other-... |
99 | relationship: Wife, Own-child, Husband, Not-in... |
100 | race: White, Asian-Pac-Islander, Amer-Indian-E... |
101 | sex: Female, Male. |
102 | capital-gain: continuous. |
103 | capital-loss: continuous. |
104 | hours-per-week: continuous. |
105 | native-country: United-States, Cambodia, Engla... |
✨ 필요한 작업
: 확인된 열 이름을 train과 test에 넣어주기
'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation' ,'relationship', 'race' ,'sex' ,'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income'
age : 나이
workclass : 고용 형태
fnlwgt : 사람 대표성을 나타내는 가중치 (final weight의 약자)
education : 교육 수준
education-num : 교육 수준 수치
marital-status : 결혼 상태
occupation : 업종
relationship : 가족 관계
race : 인종
sex : 성별
capital-gain : 양도 소득
capital-loss : 양도 손실
hours-per-week : 주당 근무 시간
native-country : 국적
income : 수익 (예측해야 하는 값)
데이터 살펴보기 전 기본사항 처리¶
train.shape
(32560, 15)
test.shape
(16280, 15)
# Label the training columns with the attribute names taken from the dataset
# description, in file order; the last column ('income') is the value to predict.
train.columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]
# Spot-check a few random rows under the new column names.
train.sample(5)
age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
17118 | 44 | Private | 462838 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Female | 0 | 0 | 48 | United-States | <=50K |
20055 | 38 | Self-emp-not-inc | 151322 | HS-grad | 9 | Separated | Craft-repair | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
14999 | 30 | Private | 167476 | Some-college | 10 | Married-civ-spouse | Transport-moving | Husband | White | Male | 0 | 0 | 60 | United-States | <=50K |
24602 | 35 | Self-emp-not-inc | 112271 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 50 | United-States | >50K |
30203 | 27 | Private | 243569 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | White | Male | 3942 | 0 | 40 | United-States | <=50K |
# Label the test columns with the same attribute names, in the same order,
# as the training frame.
test.columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]
# Spot-check a few random rows under the new column names.
test.sample(5)
age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
6610 | 65 | ? | 76131 | HS-grad | 9 | Never-married | ? | Unmarried | Asian-Pac-Islander | Female | 0 | 0 | 40 | United-States | <=50K. |
9265 | 25 | Private | 66935 | Bachelors | 13 | Never-married | Exec-managerial | Own-child | White | Male | 0 | 0 | 40 | United-States | <=50K. |
5802 | 47 | Self-emp-not-inc | 294671 | HS-grad | 9 | Married-civ-spouse | Sales | Husband | White | Male | 0 | 0 | 55 | United-States | >50K. |
4369 | 46 | Local-gov | 88564 | Some-college | 10 | Married-civ-spouse | Handlers-cleaners | Husband | White | Male | 0 | 0 | 40 | United-States | <=50K. |
2282 | 25 | Private | 150132 | HS-grad | 9 | Never-married | Adm-clerical | Not-in-family | White | Female | 0 | 0 | 40 | United-States | <=50K. |
데이터 살펴보기¶
- train 파일 기준으로!
기본적인 사항¶
train.shape
(32560, 15)
train.describe()
age | fnlwgt | education-num | capital-gain | capital-loss | hours-per-week | |
---|---|---|---|---|---|---|
count | 32560.000000 | 3.256000e+04 | 32560.000000 | 32560.000000 | 32560.000000 | 32560.000000 |
mean | 38.581634 | 1.897818e+05 | 10.080590 | 1077.615172 | 87.306511 | 40.437469 |
std | 13.640642 | 1.055498e+05 | 2.572709 | 7385.402999 | 402.966116 | 12.347618 |
min | 17.000000 | 1.228500e+04 | 1.000000 | 0.000000 | 0.000000 | 1.000000 |
25% | 28.000000 | 1.178315e+05 | 9.000000 | 0.000000 | 0.000000 | 40.000000 |
50% | 37.000000 | 1.783630e+05 | 10.000000 | 0.000000 | 0.000000 | 40.000000 |
75% | 48.000000 | 2.370545e+05 | 12.000000 | 0.000000 | 0.000000 | 45.000000 |
max | 90.000000 | 1.484705e+06 | 16.000000 | 99999.000000 | 4356.000000 | 99.000000 |
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32560 entries, 0 to 32559
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 32560 non-null int64
1 workclass 32560 non-null object
2 fnlwgt 32560 non-null int64
3 education 32560 non-null object
4 education-num 32560 non-null int64
5 marital-status 32560 non-null object
6 occupation 32560 non-null object
7 relationship 32560 non-null object
8 race 32560 non-null object
9 sex 32560 non-null object
10 capital-gain 32560 non-null int64
11 capital-loss 32560 non-null int64
12 hours-per-week 32560 non-null int64
13 native-country 32560 non-null object
14 income 32560 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
train.isnull().sum()
age 0
workclass 0
fnlwgt 0
education 0
education-num 0
marital-status 0
occupation 0
relationship 0
race 0
sex 0
capital-gain 0
capital-loss 0
hours-per-week 0
native-country 0
income 0
dtype: int64
# Check for missing values visually
import missingno as msno
msno.matrix(train)
<AxesSubplot:>
==> 결측값이 없다고 나옴. 그러나 간혹 결측값을 '-'나 '?'로 처리하는 경우가 있어서 문자형인 경우 살펴볼 필요있음!
기술통계¶
# 수치형
train.describe()
age | fnlwgt | education-num | capital-gain | capital-loss | hours-per-week | |
---|---|---|---|---|---|---|
count | 32560.000000 | 3.256000e+04 | 32560.000000 | 32560.000000 | 32560.000000 | 32560.000000 |
mean | 38.581634 | 1.897818e+05 | 10.080590 | 1077.615172 | 87.306511 | 40.437469 |
std | 13.640642 | 1.055498e+05 | 2.572709 | 7385.402999 | 402.966116 | 12.347618 |
min | 17.000000 | 1.228500e+04 | 1.000000 | 0.000000 | 0.000000 | 1.000000 |
25% | 28.000000 | 1.178315e+05 | 9.000000 | 0.000000 | 0.000000 | 40.000000 |
50% | 37.000000 | 1.783630e+05 | 10.000000 | 0.000000 | 0.000000 | 40.000000 |
75% | 48.000000 | 2.370545e+05 | 12.000000 | 0.000000 | 0.000000 | 45.000000 |
max | 90.000000 | 1.484705e+06 | 16.000000 | 99999.000000 | 4356.000000 | 99.000000 |
==> 수치형의 경우 capital-gain과 capital-loss가 75%까지 0인 것을 보아 한번 전체적으로 살펴봐야함!
# 문자형
train.describe(include="O")
workclass | education | marital-status | occupation | relationship | race | sex | native-country | income | |
---|---|---|---|---|---|---|---|---|---|
count | 32560 | 32560 | 32560 | 32560 | 32560 | 32560 | 32560 | 32560 | 32560 |
unique | 9 | 16 | 7 | 15 | 6 | 5 | 2 | 42 | 2 |
top | Private | HS-grad | Married-civ-spouse | Prof-specialty | Husband | White | Male | United-States | <=50K |
freq | 22696 | 10501 | 14976 | 4140 | 13193 | 27815 | 21789 | 29169 | 24719 |
# Print every distinct value of each string-typed (object) column, so that
# placeholder categories hidden inside text columns become visible.
for col in train.columns:
    if train[col].dtype == 'object':
        categories = train[col].unique()
        print(f'[{col}]-- ({len(categories)}개)')
        # str() keeps join() from failing should a column hold a non-string value.
        print('\n'.join(map(str, categories)))
        print()
[workclass]-- (9개)
Self-emp-not-inc
Private
State-gov
Federal-gov
Local-gov
?
Self-emp-inc
Without-pay
Never-worked
[education]-- (16개)
Bachelors
HS-grad
11th
Masters
9th
Some-college
Assoc-acdm
Assoc-voc
7th-8th
Doctorate
Prof-school
5th-6th
10th
1st-4th
Preschool
12th
[marital-status]-- (7개)
Married-civ-spouse
Divorced
Married-spouse-absent
Never-married
Separated
Married-AF-spouse
Widowed
[occupation]-- (15개)
Exec-managerial
Handlers-cleaners
Prof-specialty
Other-service
Adm-clerical
Sales
Craft-repair
Transport-moving
Farming-fishing
Machine-op-inspct
Tech-support
?
Protective-serv
Armed-Forces
Priv-house-serv
[relationship]-- (6개)
Husband
Not-in-family
Wife
Own-child
Unmarried
Other-relative
[race]-- (5개)
White
Black
Asian-Pac-Islander
Amer-Indian-Eskimo
Other
[sex]-- (2개)
Male
Female
[native-country]-- (42개)
United-States
Cuba
Jamaica
India
?
Mexico
South
Puerto-Rico
Honduras
England
Canada
Germany
Iran
Philippines
Italy
Poland
Columbia
Cambodia
Thailand
Ecuador
Laos
Taiwan
Haiti
Portugal
Dominican-Republic
El-Salvador
France
Guatemala
China
Japan
Yugoslavia
Peru
Outlying-US(Guam-USVI-etc)
Scotland
Trinadad&Tobago
Greece
Nicaragua
Vietnam
Hong
Ireland
Hungary
Holand-Netherlands
[income]-- (2개)
<=50K
>50K
위 방식으로도 unique값을 볼 수 있지만, 각 열 값들의 unique 개수가 많으니 --> 시각화그래프를 통해 보다 잘 파악할 수 있음
#그래프
sns.set_style("whitegrid")
# List each string-typed (object) column together with its number of distinct values.
for col in train:
    if train[col].dtype == "object":
        cat = train[col].unique()
        print(f'[{col}]--({len(cat)}개)')
[workclass]--(9개)
[education]--(16개)
[marital-status]--(7개)
[occupation]--(15개)
[relationship]--(6개)
[race]--(5개)
[sex]--(2개)
[native-country]--(42개)
[income]--(2개)
sex value 확인¶
train.sex.value_counts()
Male 21789
Female 10771
Name: sex, dtype: int64
# Bar chart of the number of records per value of 'sex'.
# Create the canvas (a single axes).
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 6))
# Count the rows per value; the axes created above is the current axes, so
# naming it explicitly draws onto the same target.
sns.countplot(x="sex", data=train, ax=ax)
plt.show()
# Basic setup before plotting
# Select the "Malgun Gothic" font so that Korean text in the figures is not rendered broken
plt.rc("font", family="Malgun Gothic")
성별 / 수입 확인 시각화¶
# Side-by-side count plots of 'sex' and 'income' on a shared y axis.
fig, axes = plt.subplots(1, 2, figsize=(13, 7), sharey=True)
sns.countplot(data=train, x='sex', ax=axes[0], palette="Set2", edgecolor='black')
sns.countplot(data=train, x='income', ax=axes[1], color='gray', edgecolor='black')

# Adjust the margins and the x-axis label styling of every subplot.
for ax in axes:
    ax.margins(0.12, 0.15)
    ax.xaxis.label.set_size(12)
    ax.xaxis.label.set_weight('bold')

# Figure title
plt.suptitle('성별/ 수입 분포',
             fontsize=17,
             fontweight='bold',
             x=0.05, y=1.06,
             ha='left')  # horizontal alignment

plt.tight_layout()
plt.show()
인종/ 국적 확인 시각화¶
# Side-by-side count plots of 'race' and 'native-country' on a shared y axis.
fig, axes = plt.subplots(1, 2, figsize=(20, 7), sharey=True)
sns.countplot(data=train, x='race', ax=axes[0], color="gray", edgecolor='black')
sns.countplot(data=train, x='native-country', ax=axes[1], color='gray', edgecolor='black')

# Adjust the margins and the x-axis label styling of every subplot.
for ax in axes:
    ax.margins(0.12, 0.15)
    ax.xaxis.label.set_size(12)
    ax.xaxis.label.set_weight('bold')

# Rotate the tick labels of the second (native-country) subplot to vertical.
plt.setp(axes[1].xaxis.get_majorticklabels(), rotation=90)

# Figure title
plt.suptitle('인종/ 국적 분포',
             fontsize=17,
             fontweight='bold',
             x=0.05, y=1.06,
             ha='left')  # horizontal alignment

plt.tight_layout()
plt.show()
가족관계 / 결혼 상태 확인 시각화¶
# Side-by-side count plots of 'relationship' and 'marital-status' on a shared y axis.
fig, axes = plt.subplots(1, 2, figsize=(20, 7), sharey=True)
sns.countplot(data=train, x='relationship', ax=axes[0], palette="Set2", edgecolor='black')
sns.countplot(data=train, x='marital-status', ax=axes[1], palette='Set2', edgecolor='black')

# Adjust the margins and the x-axis label styling of every subplot.
for ax in axes:
    ax.margins(0.12, 0.15)
    ax.xaxis.label.set_size(12)
    ax.xaxis.label.set_weight('bold')

# Tilt the tick labels of the second (marital-status) subplot by 50 degrees.
plt.setp(axes[1].xaxis.get_majorticklabels(), rotation=50)

# Figure title
plt.suptitle('가족관계 / 결혼상태 분포',
             fontsize=17,
             fontweight='bold',
             x=0.05, y=1.06,
             ha='left')  # horizontal alignment

plt.tight_layout()
plt.show()
고용형태 / 직업/ 학력 확인 시각화¶
# Count plots of 'workclass', 'occupation' and 'education' on a shared y axis.
fig, axes = plt.subplots(1, 3, figsize=(20, 7), sharey=True)
sns.countplot(data=train, x='workclass', ax=axes[0], palette="Set2", edgecolor='black')
sns.countplot(data=train, x='occupation', ax=axes[1], palette='Set2', edgecolor='black')
sns.countplot(data=train, x='education', ax=axes[2], palette='Set2', edgecolor='black')

# Adjust margins and x-axis label styling, and rotate the tick labels of
# every subplot to vertical.
for ax in axes:
    ax.margins(0.12, 0.15)
    ax.xaxis.label.set_size(12)
    ax.xaxis.label.set_weight('bold')
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=90)

# Figure title
plt.suptitle('고용형태 / 직업 / 학력 분포',
             fontsize=17,
             fontweight='bold',
             x=0.05, y=1.06,
             ha='left')  # horizontal alignment

plt.tight_layout()
plt.show()
나이 분포 확인 시각화¶
# Histogram of the 'age' column in 10 bins, with the y axis fixed to 0-6000.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(train['age'], bins=10)
ax.set(ylim=(0, 6000), title='나이 분포')
plt.show()
전처리 하기¶
- 나이, 결혼 여부, 직종 등 총 14개의 feature를 통해
각 사람의 소득을 예측
하는 게 목표!
==> feature와 target을 나누어 모델에 넣어줘야 하니, income feature를 먼저 따로 빼놓는다!
income 분리 후 처리¶
--> 처리의 형식을 1과 0으로 레이블 인코딩(Label Encoding)해준다.
--> T/F를 이용해 변환한다.
# Boolean mask of the rows whose income label is not '<=50K'.
# The label ends in an upper-case 'K'; the output recorded below came from a
# comparison against lower-case '<=50k', which matches no row and is therefore
# True everywhere.  strip() guards against whitespace-padded labels.
train.income.str.strip() != '<=50K'
0 True
1 True
2 True
3 True
4 True
...
32555 True
32556 True
32557 True
32558 True
32559 True
Name: income, Length: 32560, dtype: bool
# The same mask cast to integers: 1 where income is not '<=50K', 0 otherwise.
# The output recorded below came from a comparison against lower-case '<=50k',
# which matches no row, which is why it shows 1 for every row.
(train.income.str.strip() != '<=50K').astype(int)
0 1
1 1
2 1
3 1
4 1
..
32555 1
32556 1
32557 1
32558 1
32559 1
Name: income, Length: 32560, dtype: int32
# Binary target: 1 for '>50K', 0 for '<=50K'.
# The comparison literal must use an upper-case 'K' to match the labels; the
# lower-case '<=50k' matched nothing and produced a target of all 1s.
# strip() guards against whitespace-padded labels.
target = (train.income.str.strip() != '<=50K').astype(int)
# Remove the label column so that train holds only the feature columns.
train.drop(['income'], axis=1, inplace=True)
train.head(2)
age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States |
1 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States |
이후 계획¶
인코딩¶
categorical feature의 경우에는 일반적인 모델에서 바로 사용할 수 없기 때문에 변환(encoding)하기
- Label Encoding
- One-Hot Encoding
- Mean Encoding
과적합 방지를 위한 Test/Validation 분리¶
원래는 데이터의 과적합을 방지하기 위해 test dataset으로 평가하기 전에 validation dataset을 만들어 train dataset으로 훈련한 결과를 확인하기
그리고 이런 validation에도 여러가지 방법이 존재합니다. 특히 캐글에서는 Cross-Validation을 많이 사용함!
- KFold
- Stratified KFold
모델 훈련 및 예측¶
scikit-learn에서 제공하는 대부분의 모델은 train, target을 입력으로 받아 훈련을 진행하기
- Decision Tree
- KNN
- Linear Classification
- Logistic Regression
- Lasso
- Ridge
- Random Forest
- XGBoost
- Perceptron
'😎 프로젝트 만들기 > - EDA(kaggle,etc)' 카테고리의 다른 글
[kaggle][성인 인구조사 소득예측] 🐱💻 3. Raw File trimming (feat. 계속 바뀌는 환경) (0) | 2022.03.15 |
---|---|
[kaggle][성인 인구조사 소득예측] 🐱💻 2. Deep EDA & Feature Engineering (0) | 2022.03.11 |
[kaggle] 🤨시작하기 전 - 성인 인구조사 소득 예측 대회 (0) | 2022.03.07 |
[pandas] 재도전! 수원시 종합병원데이터 전처리-1 (feat경기도데이터드림이 짱) (0) | 2022.01.30 |
[pandas] 수원시 종합병원 데이터 전처리 (현실 세계의 데이터는 오류 투성이....) (0) | 2022.01.29 |