要获得优秀的模型,首先需要清洗数据。这是一篇如何在 Python 中执行数据清洗的分步指南。
数据清洗:从记录集、表或数据库中检测和修正(或删除)受损或不准确记录的过程。它识别出数据中不完善、不准确或不相关的部分,并替换、修改或删除这些脏乱的数据。
![](https://image.jiqizhixin.com/uploads/editor/d56b0b06-c513-434c-95a6-81667ff012ef/640.jpeg)
-
缺失数据;
-
不规则数据(异常值);
-
不必要数据:重复数据(repetitive data)、复制数据(duplicate data)等;
-
不一致数据:大写、地址等;
![](https://image.jiqizhixin.com/uploads/editor/a705eb23-02f7-4173-9f88-5afd245e97e9/640.png)
# import packages import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt import matplotlib.mlab as mlab import matplotlib plt.style.use('ggplot') from matplotlib.pyplot import figure %matplotlib inline matplotlib.rcParams['figure.figsize'] = (12,8) pd.options.mode.chained_assignment = None # read the data df = pd.read_csv('sberbank.csv') # shape and data types of the data print(df.shape) print(df.dtypes) # select numeric columns df_numeric = df.select_dtypes(include=[np.number]) numeric_cols = df_numeric.columns.values print(numeric_cols) # select non numeric columns df_non_numeric = df.select_dtypes(exclude=[np.number]) non_numeric_cols = df_non_numeric.columns.values print(non_numeric_cols)
cols = df.columns[:30] # first 30 columns colours = ['#000099', '#ffff00'] # specify the colours - yellow is missing. blue is not missing. sns.heatmap(df[cols].isnull(), cmap=sns.color_palette(colours))
![](https://image.jiqizhixin.com/uploads/editor/1a6769d3-128d-45bd-8c06-7f3d48618209/640.jpeg)
# if it's a larger dataset and the visualization takes too long can do this. # % of missing. for col in df.columns: pct_missing = np.mean(df[col].isnull()) print('{} - {}%'.format(col, round(pct_missing*100)))
![](https://image.jiqizhixin.com/uploads/editor/cf6aba84-ca6e-48e9-bf27-1ee69e34dcd4/640.png)
# first create missing indicator for features with missing data for col in df.columns: missing = df[col].isnull() num_missing = np.sum(missing) if num_missing > 0: print('created missing indicator for: {}'.format(col)) df['{}_ismissing'.format(col)] = missing # then based on the indicator, plot the histogram of missing values ismissing_cols = [col for col in df.columns if 'ismissing' in col] df['num_missing'] = df[ismissing_cols].sum(axis=1) df['num_missing'].value_counts().reset_index().sort_values(by='index').plot.bar(x='index', y='num_missing')
![](https://image.jiqizhixin.com/uploads/editor/8af74a59-16df-4adc-bda2-c17e9ecbd050/640.png)
# drop rows with a lot of missing values. ind_missing = df[df['num_missing'] > 35].index df_less_missing_rows = df.drop(ind_missing, axis=0)
# hospital_beds_raion has a lot of missing. # If we want to drop. cols_to_drop = ['hospital_beds_raion'] df_less_hos_beds_raion = df.drop(cols_to_drop, axis=1)
# replace missing values with the median. med = df['life_sq'].median() print(med) df['life_sq'] = df['life_sq'].fillna(med)
# impute the missing values and create the missing value indicator variables for each numeric column. df_numeric = df.select_dtypes(include=[np.number]) numeric_cols = df_numeric.columns.values for col in numeric_cols: missing = df[col].isnull() num_missing = np.sum(missing) if num_missing > 0: # only do the imputation for the columns that have missing values. print('imputing missing values for: {}'.format(col)) df['{}_ismissing'.format(col)] = missing med = df[col].median() df[col] = df[col].fillna(med)
![](https://image.jiqizhixin.com/uploads/editor/4b756734-d4b7-431f-b981-5f67e289b0b9/640.png)
# impute the missing values and create the missing value indicator variables for each non-numeric column. df_non_numeric = df.select_dtypes(exclude=[np.number]) non_numeric_cols = df_non_numeric.columns.values for col in non_numeric_cols: missing = df[col].isnull() num_missing = np.sum(missing) if num_missing > 0: # only do the imputation for the columns that have missing values. print('imputing missing values for: {}'.format(col)) df['{}_ismissing'.format(col)] = missing top = df[col].describe()['top'] # impute with the most frequent value. df[col] = df[col].fillna(top)
# categorical df['sub_area'] = df['sub_area'].fillna('_MISSING_') # numeric df['life_sq'] = df['life_sq'].fillna(-999)
# histogram of life_sq. df['life_sq'].hist(bins=100)
![](https://image.jiqizhixin.com/uploads/editor/cb1e3a77-fd52-4c7d-aa9a-a164a9aa08c4/640.png)
# box plot. df.boxplot(column=['life_sq'])
![](https://image.jiqizhixin.com/uploads/editor/6c6be3e2-d2e5-46b4-9815-3005b8607845/640.png)
df['life_sq'].describe()
![](https://image.jiqizhixin.com/uploads/editor/03d96c8a-fcf9-46d9-b5ea-5204097946b8/640.png)
# bar chart - distribution of a categorical variable df['ecology'].value_counts().plot.bar()
![](https://image.jiqizhixin.com/uploads/editor/32dd9f25-ed27-4f53-a1f2-6776200186ab/640.png)
num_rows = len(df.index) low_information_cols = [] # for col in df.columns: cnts = df[col].value_counts(dropna=False) top_pct = (cnts/num_rows).iloc[0] if top_pct > 0.95: low_information_cols.append(col) print('{0}: {1:.5f}%'.format(col, top_pct*100)) print(cnts) print()
![](https://image.jiqizhixin.com/uploads/editor/3dbe9747-be03-4427-8985-3922391d6e6d/640.png)
# we know that column 'id' is unique, but what if we drop it? df_dedupped = df.drop('id', axis=1).drop_duplicates() # there were duplicate rows print(df.shape) print(df_dedupped.shape)
![](https://image.jiqizhixin.com/uploads/editor/f2793b6c-b099-4c99-a280-3babd99837fb/640.png)
key = ['timestamp', 'full_sq', 'life_sq', 'floor', 'build_year', 'num_room', 'price_doc'] df.fillna(-999).groupby(key)['id'].count().sort_values(ascending=False).head(20)
![](https://image.jiqizhixin.com/uploads/editor/f8b1ff2d-dd45-4fa2-b910-66691aeee053/640.png)
# drop duplicates based on an subset of variables. key = ['timestamp', 'full_sq', 'life_sq', 'floor', 'build_year', 'num_room', 'price_doc'] df_dedupped2 = df.drop_duplicates(subset=key) print(df.shape) print(df_dedupped2.shape)
![](https://image.jiqizhixin.com/uploads/editor/56195ff6-285e-43b9-b91c-ff74787365bd/640.png)
df['sub_area'].value_counts(dropna=False)
![](https://image.jiqizhixin.com/uploads/editor/216570b2-c5e2-4806-91a9-2266997cadb3/640.png)
# make everything lower case. df['sub_area_lower'] = df['sub_area'].str.lower() df['sub_area_lower'].value_counts(dropna=False)
![](https://image.jiqizhixin.com/uploads/editor/c015b463-0bc8-43a4-849a-90fffac83077/640.png)
df
![](https://image.jiqizhixin.com/uploads/editor/e1c5a569-a465-40a5-a477-99e3ea517d02/640.png)
df['timestamp_dt'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d') df['year'] = df['timestamp_dt'].dt.year df['month'] = df['timestamp_dt'].dt.month df['weekday'] = df['timestamp_dt'].dt.weekday print(df['year'].value_counts(dropna=False)) print() print(df['month'].value_counts(dropna=False))
![](https://image.jiqizhixin.com/uploads/editor/612fc36c-b4f2-4ce9-8617-324e350f8824/640.png)
from nltk.metrics import edit_distance df_city_ex = pd.DataFrame(data={'city': ['torontoo', 'toronto', 'tronto', 'vancouver', 'vancover', 'vancouvr', 'montreal', 'calgary']}) df_city_ex['city_distance_toronto'] = df_city_ex['city'].map(lambda x: edit_distance(x, 'toronto')) df_city_ex['city_distance_vancouver'] = df_city_ex['city'].map(lambda x: edit_distance(x, 'vancouver')) df_city_ex
![](https://image.jiqizhixin.com/uploads/editor/d31ab7b3-a0b0-431d-b2df-4a53df99b151/640.png)
msk = df_city_ex['city_distance_toronto'] <= 2 df_city_ex.loc[msk, 'city'] = 'toronto' msk = df_city_ex['city_distance_vancouver'] <= 2 df_city_ex.loc[msk, 'city'] = 'vancouver' df_city_ex
![](https://image.jiqizhixin.com/uploads/editor/417f5523-5c67-42de-944e-36cb009500ea/640.png)
# no address column in the housing dataset. So create one to show the code. df_add_ex = pd.DataFrame(['123 MAIN St Apartment 15', '123 Main Street Apt 12 ', '543 FirSt Av', ' 876 FIRst Ave.'], columns=['address']) df_add_ex
![](https://image.jiqizhixin.com/uploads/editor/b6d065dc-1fb7-4b5f-a97e-29ab599360f1/640.png)
df_add_ex['address_std'] = df_add_ex['address'].str.lower() df_add_ex['address_std'] = df_add_ex['address_std'].str.strip() # remove leading and trailing whitespace.