Frequently used commands:

Read in the data file
home_data = pd.read_csv(iowa_file_path)


Show all column names of a DataFrame: raw.columns.values
Show how many columns a DataFrame has: len(raw.columns)

#Count the null values in each column
missing_val_count_by_column = (home_data.isnull().sum())
print(missing_val_count_by_column)
print(missing_val_count_by_column[missing_val_count_by_column > 0])

#Make a copy of the original DataFrame
new_data = home_data.copy()

Count the null values in a single column
print(new_data['LotFrontage'].isnull().sum())

Keep only the numeric columns
new_data_numeric = new_data.select_dtypes(exclude=['object'])


Drop rows in the training data that have no label; 'SalePrice' is the label column
train_data.dropna(axis=0, subset=['SalePrice'], inplace=True)


NaN handling
https://www.jianshu.com/p/41039996d867
Drop rows that contain null values: df.dropna(axis=0)

#Check the dtype of each column; object usually means the column contains strings
train_predictors.dtypes

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
#Train a random forest on the training features and labels, evaluate it with cross-validated MAE, and return the MAE score
def get_mae(X, y):
    # multiply by -1 to turn sklearn's negative-MAE convention into a positive score
    return -1 * cross_val_score(RandomForestRegressor(n_estimators=50),
                                X, y,
                                scoring='neg_mean_absolute_error').mean()
predictors_without_categoricals = train_predictors.select_dtypes(exclude=['object'])
mae_without_categoricals = get_mae(predictors_without_categoricals, target)
mae_one_hot_encoded = get_mae(one_hot_encoded_training_predictors, target)
print('Mean Absolute Error when Dropping Categoricals: ' + str(int(mae_without_categoricals)))
print('Mean Absolute Error with One-Hot Encoding: ' + str(int(mae_one_hot_encoded)))

#Align columns so that, after one-hot encoding, the test set has exactly the same columns as the training set
one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)
one_hot_encoded_test_predictors = pd.get_dummies(test_predictors)
final_train, final_test = one_hot_encoded_training_predictors.align(one_hot_encoded_test_predictors,
                                                                    join='left',
                                                                    axis=1)

#Converting between a DataFrame and an np.array in pandas
#DataFrame to array
df = df.values
#array to DataFrame
import pandas as pd
df = pd.DataFrame(df)

########################## Check for nulls and impute ############
#1---------- Which columns have null values -------------------#
print("#1")
missing_val_count_by_column = (X1.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])
print(X1.dtypes)

#Impute every null value in the raw home_data (first confirm the target column has no nulls; if it does, drop those rows)
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer(strategy='most_frequent')
imputed_X1 = pd.DataFrame(my_imputer.fit_transform(X1))
print(imputed_X1.head())

#Verify that after imputing the full data, no column contains null values any more
missing_val_count_by_column2 = (imputed_X1.isnull().sum())
print(missing_val_count_by_column2[missing_val_count_by_column2 > 0])
#1----------------------------------------#
########################## Check for nulls and impute ############

#Drop the Id and SalePrice columns from the raw dataset and save the result as X1
X1=home_data.drop(['Id', 'SalePrice'], axis=1)


#2---------- One-hot encode the object columns of imputed_X1 -------------------#
print("#2")
low_cardinality_cols = [cname for cname in imputed_X1.columns if
                        imputed_X1[cname].nunique() < 10 and
                        imputed_X1[cname].dtype == "object"]
numeric_cols = [cname for cname in imputed_X1.columns if
                imputed_X1[cname].dtype in ['int64', 'float64']]
print("low_cardinality_cols数量:" + str(len(low_cardinality_cols)))
print("numeric_cols数量:" + str(len(numeric_cols)))

my_cols = low_cardinality_cols + numeric_cols
candidate_X = imputed_X1[my_cols]
one_hot_encoded_X = pd.get_dummies(candidate_X)
print(one_hot_encoded_X.head())

missing_val_count_by_column3 = (one_hot_encoded_X.isnull().sum())
print(missing_val_count_by_column3[missing_val_count_by_column3 > 0])
#2----------------------------------------#

#Align the columns of one_hot_encoded_test_X to one_hot_encoded_X (left join); without this step one_hot_encoded_test_X would end up with more columns than one_hot_encoded_X
final_train, final_test = one_hot_encoded_X.align(one_hot_encoded_test_X,
                                                  join='left',
                                                  axis=1)
#Make test_X use the same columns as X
test_X = test_data[X.columns]

#7---------- Build a model from the numeric columns and the null-free object columns -------------------#
print("#7")
print("home_data.columns.values:" + str(len(home_data.columns.values)))
print("test_data.columns.values:" + str(len(test_data.columns.values)))

home_data_predictors = home_data.drop(['Id', 'SalePrice'], axis=1)
test_data_predictors = test_data.drop(['Id'], axis=1)
print("home_data_predictors.columns.values:" + str(len(home_data_predictors.columns.values)))
print("test_data_predictors.columns.values:" + str(len(test_data_predictors.columns.values)))
print("home_data_predictors:" + str(len(home_data_predictors)))
print("test_data_predictors:" + str(len(test_data_predictors)))

#Append the test set to the training set and store the result in all_data_predictors
all_data_predictors = home_data_predictors.append(test_data_predictors)
print("all_data_predictors.columns.values:" + str(len(all_data_predictors.columns.values)))
print("all_data_predictors:" + str(len(all_data_predictors)))

#Select only the object columns and store them in object_all_data_predictors
object_all_data_predictors = all_data_predictors.select_dtypes(include=['object'])
print("object_all_data_predictors.columns.values:" + str(len(object_all_data_predictors.columns.values)))
print("object_all_data_predictors:" + str(len(object_all_data_predictors)))

#Select only the numeric columns and store them in numeric_all_data_predictors
numeric_all_data_predictors = all_data_predictors.select_dtypes(include=['number'])
print("numeric_all_data_predictors.columns.values:" + str(len(numeric_all_data_predictors.columns.values)))
print("numeric_all_data_predictors:" + str(len(numeric_all_data_predictors)))

#Check which columns of numeric_all_data_predictors have null values
missing_val_count_by_column = (numeric_all_data_predictors.isnull().sum())
print("Columns of numeric_all_data_predictors with null values:")
print(missing_val_count_by_column[missing_val_count_by_column > 0])

#Impute numeric_all_data_predictors and store the result in inputed_numeric_all_data_predictors
my_imputer = SimpleImputer()
inputed_numeric_all_data_predictors = pd.DataFrame(my_imputer.fit_transform(numeric_all_data_predictors))

#Check which columns of inputed_numeric_all_data_predictors have null values
missing_val_count_by_column = (inputed_numeric_all_data_predictors.isnull().sum())
print("Columns of inputed_numeric_all_data_predictors with null values:")
print(missing_val_count_by_column[missing_val_count_by_column > 0])

#Find the columns of object_all_data_predictors that contain null (NaN) values
cols_with_missing_in_all_data = [col for col in object_all_data_predictors.columns
                                 if object_all_data_predictors[col].isnull().any()]
print("len(cols_with_missing_in_all_data): " + str(len(cols_with_missing_in_all_data)))

#Drop the columns of object_all_data_predictors that contain null values
reduced_object_all_data_predictors = object_all_data_predictors.drop(cols_with_missing_in_all_data, axis=1)
print("reduced_object_all_data_predictors.columns.values:" + str(len(reduced_object_all_data_predictors.columns.values)))
print("reduced_object_all_data_predictors:" + str(len(reduced_object_all_data_predictors)))

#Check which columns of reduced_object_all_data_predictors have null values
missing_val_count_by_column = (reduced_object_all_data_predictors.isnull().sum())
print("Columns of reduced_object_all_data_predictors with null values:")
print(missing_val_count_by_column[missing_val_count_by_column > 0])

#Select the object columns of reduced_object_all_data_predictors that have fewer than 10 distinct values, and store them in low_reduced_object_all_data_predictors
low_cardinality_cols = [cname for cname in reduced_object_all_data_predictors.columns if
                        reduced_object_all_data_predictors[cname].nunique() < 10 and
                        reduced_object_all_data_predictors[cname].dtype == "object"]
low_reduced_object_all_data_predictors = reduced_object_all_data_predictors[low_cardinality_cols]

#One-hot encode low_reduced_object_all_data_predictors and store the result in one_hot_object_predictors
one_hot_object_predictors = pd.get_dummies(low_reduced_object_all_data_predictors)
print("one_hot_object_predictors.columns.values:" + str(len(one_hot_object_predictors.columns.values)))
print("one_hot_object_predictors:" + str(len(one_hot_object_predictors)))


print("one_hot_object_predictors.index:")
print(one_hot_object_predictors.index)
print("inputed_numeric_all_data_predictors.index:")
print(inputed_numeric_all_data_predictors.index)
#Reset the index of one_hot_object_predictors; otherwise the concat misaligns rows because the indexes do not match
one_hot_object_predictors.reset_index(inplace=True, drop=True)
print("one_hot_object_predictors.index:")
print(one_hot_object_predictors.index)
print("inputed_numeric_all_data_predictors.index:")
print(inputed_numeric_all_data_predictors.index)


print("one_hot_object_predictors.columns.values:" + str(len(one_hot_object_predictors.columns.values)))
print("inputed_numeric_all_data_predictors.columns.values:" + str(len(inputed_numeric_all_data_predictors.columns.values)))
print("one_hot_object_predictors:" + str(len(one_hot_object_predictors)))
print("inputed_numeric_all_data_predictors:" + str(len(inputed_numeric_all_data_predictors)))

#Concat one_hot_object_predictors and inputed_numeric_all_data_predictors into final_all_data
final_all_data = pd.concat([one_hot_object_predictors, inputed_numeric_all_data_predictors], axis=1)
print("final_all_data.columns.values:" + str(len(final_all_data.columns.values)))
print("final_all_data:" + str(len(final_all_data)))

#Split final_all_data back into final_train_data and final_test_data using the original row counts
final_train_data = final_all_data.iloc[0:1460,:]
final_test_data = final_all_data.iloc[1460:2919,:]
print("final_train_data.columns.values:" + str(len(final_train_data.columns.values)))
print("final_test_data.columns.values:" + str(len(final_test_data.columns.values)))
print("final_train_data:" + str(len(final_train_data)))
print("final_test_data:" + str(len(final_test_data)))
#7----------------------------------------#
#################xgboost#############################
from xgboost import XGBRegressor
#my_model = XGBRegressor(n_estimators=1000)
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.02)
# Add silent=True to avoid printing out updates with each cycle
my_model.fit(train_X, train_y, verbose=True)
my_model.fit(train_X, train_y, early_stopping_rounds=10,
             eval_set=[(val_X, val_y)], verbose=False)
print("best_iteration:"+str(my_model.get_booster().best_iteration))
# make predictions
predictions = my_model.predict(val_X)
xgb_val_mae = mean_absolute_error(predictions, val_y)
print("Validation MAE for XGBoost Model: {:,.0f}".format(xgb_val_mae))

my_model2 = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_model2.fit(final_train_data, y,verbose=False)
test_preds = my_model2.predict(final_test_data)
print("finish")

#Check how many rows and columns a DataFrame has
data.shape

#Plot feature importances
from xgboost import plot_importance

plot_importance(xgb_model_on_full_data, max_num_features = 20)

 

#Kernel:House Prices: 1st Approach to Data Science Process

#Seaborn is a higher-level API wrapped around matplotlib
import seaborn as sns

#For Jupyter notebooks: render figures inline in the console; comment this out when using Spyder
%matplotlib inline

#Goals of the data analysis
1. Gain a preliminary understanding of the available data
2. Check for missing or null values
3. Find potential outliers
4. Assess correlations amongst attributes/features
5. Check for data skew

#Notes on feature analysis
home_data.select_dtypes(exclude=['object']).columns
#This selects the numeric columns, but if one row of a numeric column contains a dirty value, pandas reads the whole column as object. How can that be avoided? (See the sketch below.)
#Some numeric columns hold discrete codes rather than measurements; such columns really belong with the categorical data.
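A minimal sketch (hypothetical toy data, not from these notes) of one way to handle the first problem: pd.to_numeric with errors='coerce' keeps the clean values and turns the dirty cells into NaN, which an imputer can then fill.

import pandas as pd

df_dirty = pd.DataFrame({'LotArea': ['8450', '9600', 'bad_value', '11250']})
df_dirty['LotArea'] = pd.to_numeric(df_dirty['LotArea'], errors='coerce')  # dirty cell becomes NaN
print(df_dirty.dtypes)                     # LotArea is now float64
print(df_dirty['LotArea'].isnull().sum())  # 1 missing value left to impute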

#df.describe().round(decimals=2) keeps two decimal places
home_data.select_dtypes(exclude=['object']).describe().round(decimals=2)

#Summary statistics for the categorical columns
home_data.select_dtypes(include=['object']).describe()

#Skewed data
#A few extremely large values drag the overall mean upward, away from the bulk of the data
#Skewed data strongly hurts the accuracy of linear regression, but has little effect on decision trees and random forests

#Plot the distribution of a single column
target = home_data.SalePrice
plt.figure()
sns.distplot(target)
plt.title('Distribution of SalePrice')
plt.show()

#Take the log of a column, then plot its distribution again
sns.distplot(np.log(target))
plt.title('Distribution of Log-transformed SalePrice')
plt.xlabel('log(SalePrice)')
plt.show()

#pd.skew(): skewness, the direction and degree of asymmetry of a distribution's density curve around its mean; visually, it is the relative length of the curve's tail.
#Taking the log of a column can greatly reduce its skewness
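A minimal sketch (synthetic right-skewed data, not from these notes) showing how much a log transform shrinks the skewness.

import numpy as np
import pandas as pd

prices = pd.Series(np.random.lognormal(mean=12, sigma=0.5, size=1000))
print('skew before log:', round(prices.skew(), 2))
print('skew after log: ', round(np.log1p(prices).skew(), 2))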

#Plot the distribution of every numeric column
num_attributes = home_data.select_dtypes(exclude='object').drop('SalePrice', axis=1).copy()

fig = plt.figure(figsize=(12,18))  # width and height of the figure in inches
for i in range(len(num_attributes.columns)):  # iterate over all numeric columns
    fig.add_subplot(9,4,i+1)  # a 9x4 grid gives 36 slots (37 numeric columns minus the target leaves exactly 36)
    sns.distplot(num_attributes.iloc[:,i].dropna())  # drop the nulls, then plot the column's distribution
    plt.xlabel(num_attributes.columns[i])  # label with the column name

plt.tight_layout()  # space the subplots evenly over the figure
plt.show()

#Uni-modal: described by a single-peaked distribution

#How to handle outliers
1 Check the source data for entry errors
2 See whether the extreme values have a reasonable explanation
3 If an outlier cannot be investigated further, it is usually discarded

#Scatter plot of each numeric column against the target column, to spot outliers
f = plt.figure(figsize=(12,20))

for i in range(len(num_attributes.columns)):
    f.add_subplot(9, 4, i+1)
    sns.scatterplot(num_attributes.iloc[:,i], target)

plt.tight_layout()
plt.show()

#Show the correlations between all numeric columns
#If you use a linear regression model, be sure to remove highly correlated features
correlation = home_data.corr()

f, ax = plt.subplots(figsize=(14,12))
plt.title('Correlation of numerical attributes', size=16)
sns.heatmap(correlation,annot=True)
plt.show()

#Correlation of the target column with every other column
correlation['SalePrice'].sort_values(ascending=False).head(15)

#Scatter plot of each numeric column against the target, annotated with that column's correlation to the target
num_columns = home_data.select_dtypes(exclude='object').columns
corr_to_price = correlation['SalePrice']
n_cols = 5
n_rows = 8
fig, ax_arr = plt.subplots(n_rows, n_cols, figsize=(16,20), sharey=True)
plt.subplots_adjust(bottom=-0.8)
for j in range(n_rows):
    for i in range(n_cols):
        plt.sca(ax_arr[j, i])
        index = i + j*n_cols
        if index < len(num_columns):
            plt.scatter(home_data[num_columns[index]], home_data.SalePrice)
            plt.xlabel(num_columns[index])
            plt.title('Corr to SalePrice = '+ str(np.around(corr_to_price[index], decimals=3)))
plt.show()

#Using the correlation matrix, drop one of each pair of highly correlated features, then drop the features with low correlation to the target (a sketch follows below)
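A minimal sketch of that rule (the 0.8 and 0.1 thresholds and the variable names are assumptions, not from these notes): drop one feature of each highly correlated pair, then drop features that are weakly correlated with the target.

import numpy as np

corr_matrix = home_data.select_dtypes(exclude=['object']).corr()
upper = corr_matrix.abs().where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
high_corr_drop = [col for col in upper.columns if (upper[col] > 0.8).any()]  # one of each correlated pair
target_corr = corr_matrix['SalePrice'].abs()
low_corr_drop = target_corr[target_corr < 0.1].index.tolist()                # weakly related to the target
reduced_data = home_data.drop(columns=set(high_corr_drop + low_corr_drop) - {'SalePrice'})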

#Find which numeric columns contain null values
num_attributes.isna().sum().sort_values(ascending=False).head()

#Find which object columns contain null values
home_data[cat_columns].isna().sum().sort_values(ascending=False).head(17)
#In this dataset some nulls simply mean the house lacks that facility, so they can be filled with "None"

#Fill the nulls in the MasVnrArea column with 0
home_data_copy.MasVnrArea = home_data_copy.MasVnrArea.fillna(0)

#Fill the nulls in the object columns with "None"
cat_cols_fill_none = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
                      'GarageCond', 'GarageQual', 'GarageFinish', 'GarageType',
                      'BsmtFinType2', 'BsmtExposure', 'BsmtFinType1', 'BsmtQual', 'BsmtCond',
                      'MasVnrType']
for cat in cat_cols_fill_none:
    home_data_copy[cat] = home_data_copy[cat].fillna("None")


#Drop outliers identified from the scatter plots of each column against the target
home_data_copy = home_data_copy.drop(home_data_copy['LotFrontage']
                                     [home_data_copy['LotFrontage']>200].index)
home_data_copy = home_data_copy.drop(home_data_copy['LotArea']
                                     [home_data_copy['LotArea']>100000].index)

#Take the log of the target column and rename it
home_data_copy['SalePrice'] = np.log(home_data_copy['SalePrice'])
home_data_copy = home_data_copy.rename(columns={'SalePrice': 'SalePrice_log'})

#Keep the column names after SimpleImputer
my_imputer = SimpleImputer()
train_X_inputed = my_imputer.fit_transform(train_X)
val_X_inputed = my_imputer.transform(val_X)

train_X_inputed = pd.DataFrame(train_X_inputed, columns = train_X.columns)
val_X_inputed = pd.DataFrame(val_X_inputed, columns = val_X.columns)
print(train_X_inputed.head(5))

# Align the columns of the training set and the test set
X = home_data_copy.drop(attributes_drop, axis=1)
X = pd.get_dummies(X)
test_X = test_data.drop(attributes_drop, axis=1)
test_X = pd.get_dummies(test_X)

final_train, final_test = X.align(test_X, join='left', axis=1)
final_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
final_train_imputed = my_imputer.fit_transform(final_train)
final_test_imputed = my_imputer.transform(final_test)
final_model.fit(final_train_imputed, y)
test_preds = final_model.predict(final_test_imputed)


#How get_dummies handles missing values
Ignore missing values: test_X = pd.get_dummies(test_X)
Add an indicator column for NaN: test_X = pd.get_dummies(test_X, dummy_na=True)

#Plot feature importances graphically
import matplotlib.pyplot as plt
fig,ax = plt.subplots(figsize=(15,15))
plot_importance(xgb_model,
                height=0.5,
                ax=ax,
                max_num_features=64)

#pd.get_dummies(X, dummy_na=True) removes all NaN values from the object columns (each NaN gets its own indicator column); then use SimpleImputer to impute the numeric columns (sketch below)
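A minimal sketch of that two-step recipe (X stands for any feature DataFrame; the other names are illustrative): after dummy_na=True, only the numeric columns still need imputation.

import pandas as pd
from sklearn.impute import SimpleImputer

encoded = pd.get_dummies(X, dummy_na=True)   # object-column NaNs become indicator columns
imputer = SimpleImputer()                    # mean imputation for the remaining numeric NaNs
encoded_imputed = pd.DataFrame(imputer.fit_transform(encoded), columns=encoded.columns)
print(encoded_imputed.isnull().sum().sum())  # expected: 0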

#Select the 30 features with the highest feature_importances_ scores
final_model.fit(final_train_imputed, y)
test_preds = final_model.predict(final_test_imputed)

thresholds = np.sort(final_model.feature_importances_)

selection = SelectFromModel(final_model, threshold=thresholds[-30], prefit=True)
select_final_train = selection.transform(final_train_imputed)
print(select_final_train)

feature_idx = selection.get_support()
feature_name = final_train_imputed.columns[feature_idx]
print(feature_name)

#Restore the column names of the final train and test sets, converting them from arrays back to DataFrames
final_test_imputed = pd.DataFrame(final_test_imputed, columns = final_test.columns)
final_train_imputed = pd.DataFrame(final_train_imputed, columns = final_train.columns)

#Choose which feature columns to keep: thresholds[0] keeps all columns, thresholds[-1] keeps only the single most important column
print(thresholds[-85])
print("len(thresholds):"+str(len(thresholds)))
selection = SelectFromModel(final_model, threshold=thresholds[-85], prefit=True)
select_final_train = selection.transform(final_train_imputed)
select_final_test = selection.transform(final_test_imputed)

#selection.transform drops the column names, so save them in feature_name
feature_idx = selection.get_support()
feature_name = final_train_imputed.columns[feature_idx]
print(feature_name)

#Feature selection strategies
1 The Id column is useless and can be dropped
2 Find the outliers in each column and drop them
3 Apply log(1+x) to the target column
4 Concat the training set and the test set and process them together
5 Some columns are numeric but actually encode categories (e.g. year, month); convert them to strings
6 Inspect each column and fill its missing values appropriately
7 Apply a boxcox1p transform to highly skewed features
8 A column with only one distinct value can be dropped
9 Columns of the same kind can be summed into a new column
10 If one column is the sum of two others, keep the sum and drop the two parts (not certain about this)
11 Some columns can be simplified: set values greater than 0 to 1 and the rest to 0
12 If a single value accounts for more than 99.94% of a column, drop that column

#Dropping the Id column
train.drop(['Id'], axis=1, inplace=True)
test.drop(['Id'], axis=1, inplace=True)

#Drop outliers (after deleting rows, reset_index is needed; drop=True discards the old index)
train = train[train.GrLivArea < 4500]
train.reset_index(drop=True, inplace=True)

#Concat the training set and the test set to process them together
features = pd.concat([train_features, test_features]).reset_index(drop=True)
print(features.shape)

#Convert numeric columns to strings
features['MSSubClass'] = features['MSSubClass'].apply(str)
features['YrSold'] = features['YrSold'].astype(str)
features['MoSold'] = features['MoSold'].astype(str)

#Fill a column's missing values with a fixed value
features['Functional'] = features['Functional'].fillna('Typ')

#Fill a column's missing values with its most frequent value (the mode)
features['Exterior1st'] = features['Exterior1st'].fillna(features['Exterior1st'].mode()[0])

#Fill missing values with a for loop
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    features[col] = features[col].fillna(0)

#Group by MSSubClass and fill MSZoning's missing values with the most frequent value within each group
features['MSZoning'] = features.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))

#List all unique values of a column
features['MSZoning'].unique()

# Select all object columns
objects = []
for i in features.columns:
    if features[i].dtype == object:
        objects.append(i)

Or equivalently:

obj_features = features.select_dtypes(include='object')
#Then show the null count of each column in obj_features
print(obj_features.isna().sum().sort_values(ascending=False))

#After filling the missing values, update the original DataFrame
features.update(features[objects].fillna('None'))

#Group by Neighborhood and fill LotFrontage's missing values with each group's median
features['LotFrontage'] = features.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

#Check each column's dtype
features.dtypes

#List the distinct dtypes that appear across the columns
features.dtypes.unique()

#Check the skewness of each numeric column
skew_features = features[numerics2].apply(lambda x: skew(x)).sort_values(ascending=False)
print(skew_features)

#Transform the highly skewed features
skew_features = features[numerics2].apply(lambda x: skew(x)).sort_values(ascending=False)
#print(skew_features)
high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index
for i in skew_index:
    features[i] = boxcox1p(features[i], boxcox_normmax(features[i] + 1))


#Sum columns of the same kind into a new feature
features['Total_Bathrooms'] = (features['FullBath'] + (0.5 * features['HalfBath']) +
                               features['BsmtFullBath'] + (0.5 * features['BsmtHalfBath']))

#Simplify a column's values
features['haspool'] = features['PoolArea'].apply(lambda x: 1 if x > 0 else 0)

#Split the combined features back into a training set and a test set
X = final_features.iloc[:len(y), :]
X_sub = final_features.iloc[len(X):, :]

#If a single value accounts for more than 99.94% of a column, drop that column
overfit = []
for i in X.columns:
    counts = X[i].value_counts()
    #print("counts:")
    #print(counts)
    zeros = counts.iloc[0]
    #print("zeros:")
    #print(zeros)
    if zeros / len(X) * 100 > 99.94:
        overfit.append(i)
        #break


overfit = list(overfit)
overfit.append('MSZoning_C (all)')
print(overfit)

X = X.drop(overfit, axis=1).copy()
X_sub = X_sub.drop(overfit, axis=1).copy()

#random_state only takes effect when shuffle=True
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)


#Model evaluation methods
1 Root Mean Squared Logarithmic Error (RMSLE)

def rmsle(y, y_pred):
    # y is assumed to be log-transformed already, so RMSE on the log values equals RMSLE
    return np.sqrt(mean_squared_error(y, y_pred))

2 Root Mean Squared Error (RMSE)
def cv_rmse(model):
    rmse = np.sqrt(-cross_val_score(model, X, y,
                                    scoring="neg_mean_squared_error",
                                    cv=kfolds))
    return rmse

#RobustScaler: a scaler that is robust to outliers in the dataset
ridge = make_pipeline(RobustScaler(),
                      RidgeCV(alphas=alphas_alt, cv=kfolds))

#Choosing a data scaling method
1 If you need centering and the downstream method assumes normally distributed data, use Z-score scaling
2 If you need a 0-1 range, or a specific output range, Min-Max scaling or MaxAbs scaling is a good choice, especially the former
3 For sparse data, Min-Max or MaxAbs scaling is still the preferred method
4 If you want to preserve the outliers in the dataset as much as possible, RobustScaler is the better choice (a sketch follows below)
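A minimal sketch (toy data with one outlier, not from these notes) comparing the scalers mentioned above; RobustScaler, based on the median and IQR, is the one least distorted by the outlier.

import numpy as np
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, RobustScaler, StandardScaler

data = np.array([[1.0], [2.0], [3.0], [4.0], [100.0]])   # the last value is an outlier
for scaler in (StandardScaler(), MinMaxScaler(), MaxAbsScaler(), RobustScaler()):
    print(type(scaler).__name__, scaler.fit_transform(data).ravel().round(2))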


#Analyzing the training data: feature engineering steps
1 df_train['SalePrice'].describe()
Check the maximum and minimum values for anything obviously illogical

2 sns.distplot(df_train['SalePrice']);
2.1 Check whether the distribution deviates from normal
2.2 Check whether it shows positive skewness or negative skewness
2.3 Check the kurtosis (peakedness); higher kurtosis (a sharper peak) means the variance is driven by infrequent extreme deviations above or below the mean.

3 print("Skewness: %f" % df_train['SalePrice'].skew())
print("Kurtosis: %f" % df_train['SalePrice'].kurt())
Show the skewness and kurtosis

4 Scatter plot of the target against a numeric feature column
var = 'GrLivArea'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0,800000));
The plot shows whether the two are linearly related

5 Box plot of the target against a categorical feature column
var = 'OverallQual'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);

6 Show the correlation matrix (heatmap)
corrmat = df_train.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True);
From this plot: if two attributes are strongly correlated, keep only one of them and drop the other

7 Correlation matrix of the n attributes most correlated with the target
k = 10 #number of variables for heatmap
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(df_train[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()
This plot shows which columns correlate most strongly with the target

8 Pairwise scatter plots of several columns
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
sns.pairplot(df_train[cols], size = 2.5)
plt.show();

9 Show the 20 columns with the most null values
total = df_train.isnull().sum().sort_values(ascending=False)
percent = (df_train.isnull().sum()/df_train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)
If a column's null rate exceeds 15%, consider dropping the whole column
If several columns have exactly the same null count, look for a logical relationship between them
If a column has only a few nulls, drop the rows containing them (see the sketch after this item)
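A minimal sketch of these rules (the 15% threshold comes from the rule above; the 5% cut-off for "only a few nulls" and the use of df_train are assumptions):

null_rate = df_train.isnull().mean()
cols_to_drop = null_rate[null_rate > 0.15].index            # columns missing more than 15%
df_train = df_train.drop(columns=cols_to_drop)
few_null_cols = null_rate[(null_rate > 0) & (null_rate <= 0.05)].index
df_train = df_train.dropna(subset=few_null_cols)            # drop the few rows with nulls in lightly affected columns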

10 Check the target column for outliers by standardizing it
saleprice_scaled = StandardScaler().fit_transform(df_train['SalePrice'][:,np.newaxis]);
low_range = saleprice_scaled[saleprice_scaled[:,0].argsort()][:10]
high_range= saleprice_scaled[saleprice_scaled[:,0].argsort()][-10:]
print('outer range (low) of the distribution:')
print(low_range)
print('\nouter range (high) of the distribution:')
print(high_range)
Look at the 10 largest and 10 smallest standardized values and how far they lie from 0

11 Scatter plot of a column against the target to find outliers
var = 'GrLivArea'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0,800000));
If an outlier clearly departs from the trend of the rest of the points, it can be deleted
#deleting points
df_train.sort_values(by = 'GrLivArea', ascending = False)[:2]
df_train = df_train.drop(df_train[df_train['Id'] == 1299].index)
df_train = df_train.drop(df_train[df_train['Id'] == 524].index)

12 Check the target column's skewness and kurtosis, and whether its probability plot follows a straight line
sns.distplot(df_train['SalePrice'], fit=norm);
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)

13 If a column is positively skewed, a log transform can remove the positive skew
df_train['SalePrice'] = np.log(df_train['SalePrice'])

sns.distplot(df_train['SalePrice'], fit=norm);
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)

14 If a column contains many zeros, it cannot be log-transformed directly; add a column flagging zero vs non-zero, then take the log of the non-zero values only (sketch below)
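A minimal sketch of rule 14 (TotalBsmtSF is just an example of a zero-heavy column; df_train is assumed):

import numpy as np

df_train['HasBsmt'] = (df_train['TotalBsmtSF'] > 0).astype(int)   # flag zero vs non-zero
nonzero = df_train['TotalBsmtSF'] > 0
df_train.loc[nonzero, 'TotalBsmtSF'] = np.log(df_train.loc[nonzero, 'TotalBsmtSF'])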

House Prices EDA: data analysis process

1 List (see the sketch after this list):
1.1 How many rows the training data has
1.2 How many rows the test data has
1.3 How many attributes there are
1.4 How many of them are numeric
1.5 How many of them are categorical
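A minimal sketch printing those counts (train and test are assumed to be the raw DataFrames read from the competition CSV files):

features_only = train.drop(columns=['Id', 'SalePrice'])
print('training rows:', len(train))
print('test rows:', len(test))
print('attributes:', features_only.shape[1])
print('numeric attributes:', features_only.select_dtypes(exclude='object').shape[1])
print('categorical attributes:', features_only.select_dtypes(include='object').shape[1])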

2 How many null values the training data has
missing = train.isnull().sum()
missing = missing[missing > 0]
missing.sort_values(inplace=True)
missing.plot.bar()

Use the column name to judge whether a null means the attribute is absent or the value was simply not recorded

3 Whether the target column follows a normal distribution
import scipy.stats as st
y = train['SalePrice']
plt.figure(1); plt.title('Johnson SU')
sns.distplot(y, kde=False, fit=st.johnsonsu)
plt.figure(2); plt.title('Normal')
sns.distplot(y, kde=False, fit=st.norm)
plt.figure(3); plt.title('Log Normal')
sns.distplot(y, kde=False, fit=st.lognorm)
If it does not follow a normal distribution, apply a log transform

4 Which numeric columns follow a normal distribution
test_normality = lambda x: stats.shapiro(x.fillna(0))[1] < 0.01
normal = pd.DataFrame(train[quantitative])
normal = normal.apply(test_normality)
print(not normal.any())

5 Plot the distribution of every numeric column
f = pd.melt(train, value_vars=quantitative)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False)
g = g.map(sns.distplot, "value")


How to drop outliers
train.drop(train[(train["GrLivArea"]>4000)&(train["SalePrice"]<300000)].index,inplace=True)

Keep the index contiguous after concat
full=pd.concat([train,test], ignore_index=True)

Bin a continuous numeric column into 10 quantile bins
full["LotAreaCut"] = pd.qcut(full.LotArea,10)

Impute using the median within each bin
full['LotFrontage']=full.groupby(['LotAreaCut','Neighborhood'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))
full['LotFrontage']=full.groupby(['LotAreaCut'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))

Fill missing values
cols = ["MasVnrArea", "BsmtUnfSF", "TotalBsmtSF", "GarageCars", "BsmtFinSF2", "BsmtFinSF1", "GarageArea"]
for col in cols:
    full[col].fillna(0, inplace=True)

cols1 = ["PoolQC" , "MiscFeature", "Alley", "Fence", "FireplaceQu", "GarageQual", "GarageCond", "GarageFinish", "GarageYrBlt", "GarageType", "BsmtExposure", "BsmtCond", "BsmtQual", "BsmtFinType2", "BsmtFinType1", "MasVnrType"]
for col in cols1:
    full[col].fillna("None", inplace=True)

cols2 = ["MSZoning", "BsmtFullBath", "BsmtHalfBath", "Utilities", "Functional", "Electrical", "KitchenQual", "SaleType","Exterior1st", "Exterior2nd"]
for col in cols2:
    full[col].fillna(full[col].mode()[0], inplace=True)


Find the discrete numeric columns
num_columns = full.select_dtypes(exclude='object').columns
discrete = []
for var in num_columns:
    if len(full[var].unique()) < 20:
        print(var, ' values: ', full[var].unique())
        discrete.append(var)

print('There are {} discrete variables'.format(len(discrete)))


Ordinal-encode a discrete numeric attribute: rank its values by the mean SalePrice of each group
def encode(frame, feature):
    ordering = pd.DataFrame()
    ordering['val'] = frame[feature].unique()
    ordering.index = ordering.val
    ordering['spmean'] = frame[[feature, 'SalePrice']].groupby(feature).mean()['SalePrice']
    ordering = ordering.sort_values('spmean')
    ordering['ordering'] = range(1, ordering.shape[0]+1)
    ordering = ordering['ordering'].to_dict()

    for cat, o in ordering.items():
        frame.loc[frame[feature] == cat, feature+'_E'] = o

qualitative = ['MSSubClass']
qual_encoded = []
for q in qualitative:
    encode(full, q)
    qual_encoded.append(q+'_E')
print(qual_encoded)
full['MSSubClass_E']

Data-processing pipeline
class labelenc(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        lab = LabelEncoder()
        X["YearBuilt"] = lab.fit_transform(X["YearBuilt"])
        X["YearRemodAdd"] = lab.fit_transform(X["YearRemodAdd"])
        X["GarageYrBlt"] = lab.fit_transform(X["GarageYrBlt"])
        return X

class skew_dummies(BaseEstimator, TransformerMixin):
    def __init__(self, skew=0.5):
        self.skew = skew

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_numeric = X.select_dtypes(exclude=["object"])
        skewness = X_numeric.apply(lambda x: skew(x))
        skewness_features = skewness[abs(skewness) >= self.skew].index
        X[skewness_features] = np.log1p(X[skewness_features])
        X = pd.get_dummies(X)
        return X

pipe = Pipeline([
    ('labenc', labelenc()),
    ('skew_dummies', skew_dummies(skew=1)),
])
full2 = full.copy()
data_pipe = pipe.fit_transform(full2)
data_pipe.shape

Scale the features with RobustScaler (robust to outliers)
scaler = RobustScaler()
n_train=train.shape[0]

X = data_pipe[:n_train]
test_X = data_pipe[n_train:]
y= train.SalePrice

X_scaled = scaler.fit(X).transform(X)
y_log = np.log(train.SalePrice)
test_X_scaled = scaler.transform(test_X)


Feature-adding pipeline
class add_feature(BaseEstimator, TransformerMixin):
    def __init__(self, additional=1):
        self.additional = additional

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if self.additional == 1:
            X["TotalHouse"] = X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"]
            X["TotalArea"] = X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"] + X["GarageArea"]

        else:
            X["TotalHouse"] = X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"]
            X["TotalArea"] = X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"] + X["GarageArea"]

            X["+_TotalHouse_OverallQual"] = X["TotalHouse"] * X["OverallQual"]
            X["+_GrLivArea_OverallQual"] = X["GrLivArea"] * X["OverallQual"]
            X["+_oMSZoning_TotalHouse"] = X["oMSZoning"] * X["TotalHouse"]
            X["+_oMSZoning_OverallQual"] = X["oMSZoning"] + X["OverallQual"]
            X["+_oMSZoning_YearBuilt"] = X["oMSZoning"] + X["YearBuilt"]
            X["+_oNeighborhood_TotalHouse"] = X["oNeighborhood"] * X["TotalHouse"]
            X["+_oNeighborhood_OverallQual"] = X["oNeighborhood"] + X["OverallQual"]
            X["+_oNeighborhood_YearBuilt"] = X["oNeighborhood"] + X["YearBuilt"]
            X["+_BsmtFinSF1_OverallQual"] = X["BsmtFinSF1"] * X["OverallQual"]

            X["-_oFunctional_TotalHouse"] = X["oFunctional"] * X["TotalHouse"]
            X["-_oFunctional_OverallQual"] = X["oFunctional"] + X["OverallQual"]
            X["-_LotArea_OverallQual"] = X["LotArea"] * X["OverallQual"]
            X["-_TotalHouse_LotArea"] = X["TotalHouse"] + X["LotArea"]
            X["-_oCondition1_TotalHouse"] = X["oCondition1"] * X["TotalHouse"]
            X["-_oCondition1_OverallQual"] = X["oCondition1"] + X["OverallQual"]

            X["Bsmt"] = X["BsmtFinSF1"] + X["BsmtFinSF2"] + X["BsmtUnfSF"]
            X["Rooms"] = X["FullBath"] + X["TotRmsAbvGrd"]
            X["PorchArea"] = X["OpenPorchSF"] + X["EnclosedPorch"] + X["3SsnPorch"] + X["ScreenPorch"]
            X["TotalPlace"] = X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"] + X["GarageArea"] + X["OpenPorchSF"] + X["EnclosedPorch"] + X["3SsnPorch"] + X["ScreenPorch"]

        return X

pipe = Pipeline([
    ('labenc', labelenc()),
    ('add_feature', add_feature(additional=2)),
    ('skew_dummies', skew_dummies(skew=1)),
])
full_pipe = pipe.fit_transform(full)
full_pipe.shape

PCA
pca = PCA(n_components=410)

X_scaled=pca.fit_transform(X_scaled)
test_X_scaled = pca.transform(test_X_scaled)
X_scaled.shape, test_X_scaled.shape

Scatter plots comparing the test set and the training set feature by feature
def plot_feature_scatter(df1, df2, features):
    i = 0
    sns.set_style('whitegrid')
    plt.figure()
    fig, ax = plt.subplots(4,4,figsize=(14,14))

    for feature in features:
        i += 1
        plt.subplot(4,4,i)
        plt.scatter(df1[feature], df2[feature], marker='+')
        plt.xlabel(feature, fontsize=9)
    plt.show();

features = ['var_0', 'var_1','var_2','var_3', 'var_4', 'var_5', 'var_6', 'var_7',
            'var_8', 'var_9', 'var_10','var_11','var_12', 'var_13', 'var_14', 'var_15',
           ]
plot_feature_scatter(train_df[::20], test_df[::20], features)  # ::20 takes every 20th row, i.e. 5% of the data


@5 splits per repeat, repeated 5 times, 25 folds in total (usage sketch below)
RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=10)

@12 splits, a single round, 12 folds in total
StratifiedKFold(n_splits=12, shuffle=False, random_state=99999)
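A minimal usage sketch (synthetic classification data, not from these notes): passing the repeated splitter to cross_val_score yields 5 x 5 = 25 scores.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

X_toy, y_toy = make_classification(n_samples=200, random_state=0)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=10)
scores = cross_val_score(RandomForestClassifier(n_estimators=50, random_state=0), X_toy, y_toy, cv=cv)
print(len(scores), round(scores.mean(), 3))   # 25 scores and their mean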

@Select one column and compute its value frequencies
train_df[train_df.target==0].iloc[:,2:3].apply(pd.value_counts).head(10)

@Take every nth row
test_copy=test_copy[::1000]
test_copy.reset_index(drop=True, inplace=True)

@For each element of the DataFrame, count how often it appears in its own column and store the count in a new column
m = len(train_copy)
cols =['var_0','var_2','var_6','var_9','var_13','var_53','var_76','var_80','var_81','var_108','var_123','var_139','var_191','var_198']
for col in cols:
    for n in range(m):
        train_copy.loc[n, col+'_fre'] = train_copy.groupby([col]).size()[train_copy[col][n]]
        if n % 100 == 0:
            print("train%", n/m*100)


@Filter the rows: find every row that contains at least one value that appears exactly once in its column
1 For each column, np.unique returns index_ (the position of each unique value) and count_ (how often each unique value occurs)
2 index_[count_ == 1] gives the positions of the values that occur exactly once
3 unique_count[index_[count_ == 1], feature] += 1 marks those positions in that column with a 1
4 Loop over columns 0-199 and apply step 3 to each, so every position holding a once-only value is marked with a 1
5 np.argwhere(np.sum(unique_count, axis=1) > 0)[:, 0] sums unique_count over each row; if any column of a row holds a 1, that row contains at least one value that is unique within its column
test_path = '../input/test.csv'
df_test = pd.read_csv(test_path)
df_test.drop(['ID_code'], axis=1, inplace=True)
df_test = df_test.values
unique_samples = []
unique_count = np.zeros_like(df_test)
for feature in range(df_test.shape[1]):
    _, index_, count_ = np.unique(df_test[:, feature], return_counts=True, return_index=True)
    unique_count[index_[count_ == 1], feature] += 1
# Samples which have unique values are real the others are fake
real_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) > 0)[:, 0]
synthetic_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) == 0)[:, 0]
print('Found',len(real_samples_indexes),'real test')
print('Found',len(synthetic_samples_indexes),'fake test')


@An alternative way to find the rows that contain at least one value appearing only once in its column
hist_df = pd.DataFrame()
for var in features:
    var_stats = train_df[var].append(test_df[var]).value_counts()
    hist_df[var] = pd.Series(test_df[var]).map(var_stats)
    hist_df[var] = hist_df[var] > 1

ind = hist_df.sum(axis=1) != 200
var_stats = {var:train_df[var].append(test_df[ind][var]).value_counts() for var in features}


@For every column, add a companion column giving how many times each value occurs in that column (frequency encoding)
# FREQUENCY ENCODE
def encode_FE(df, col, test):
    cv = df[col].value_counts()
    nm = col + '_FE'
    df[nm] = df[col].map(cv)  # this map is the key step of the function
    #df[col].map(df.groupby(col).size())
    test[nm] = test[col].map(cv)
    test[nm].fillna(0, inplace=True)
    if cv.max() <= 255:
        df[nm] = df[nm].astype('uint8')
        test[nm] = test[nm].astype('uint8')
    else:
        df[nm] = df[nm].astype('uint16')
        test[nm] = test[nm].astype('uint16')
    return

test['target'] = -1
comb = pd.concat([train,test.loc[real_samples_indexes]],axis=0,sort=True)
for i in range(200): encode_FE(comb,'var_'+str(i),test)
train = comb[:len(train)]; del comb
print('Added 200 new magic features!')

@Randomly sample part of a DataFrame; frac is the fraction of rows to take
train2 = train.sample(frac=1,random_state=42)

@OOF simply stands for "out-of-fold" and refers to a step in k-fold validation in which the predictions made on each held-out fold are collected into a single set of predictions covering every training row. These predictions are "out of the folds", so the error computed on them gives a good measure of how good your model is (sketch below).
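A minimal sketch (synthetic regression data, not from these notes) of building out-of-fold predictions: every row is predicted by a model that never saw it during fitting.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

X_toy, y_toy = make_regression(n_samples=300, n_features=10, noise=5.0, random_state=0)
oof = np.zeros(len(y_toy))
for train_idx, val_idx in KFold(n_splits=5, shuffle=True, random_state=42).split(X_toy):
    model = RandomForestRegressor(n_estimators=50, random_state=0)
    model.fit(X_toy[train_idx], y_toy[train_idx])
    oof[val_idx] = model.predict(X_toy[val_idx])       # predictions on the held-out fold
print('OOF MAE:', round(mean_absolute_error(y_toy, oof), 3))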


@Find index_ and count_ for one column's unique values, build the Series uv of values that occur only once, then add a column flagging whether each value is such a once-only value
_, index_, count_ = np.unique(df.loc[:,col].values, return_counts=True, return_index=True)
uv = df[col][index_[count_ == 1]]
df[col+'_IsUnique'] = df[col].isin(uv.tolist())
test[col+'_IsUnique'] = test[col].isin(uv.tolist())

 


@Modify a DataFrame column in bulk

During data processing you often need to apply the same operation to a whole column. For example, to set the "values" column of the DataFrame feature to 1 where the value is >= 30 and to 0 otherwise, you can use apply as follows:

def fun(x):
    if x >= 30:
        return 1
    else:
        return 0

values = feature['values'].apply(lambda x: fun(x))
#To write the change back into the original column, assign it:
feature['values'] = values
#Or modify and assign in a single step:
feature['values'] = feature['values'].apply(lambda x: fun(x))

@numpy.reshape: when one of the arguments is -1, the other dimension is taken as given and this one is inferred automatically
@In the example below I want a one-column array with the number of rows inferred, so the column argument is 1 and the row argument is -1
train_df[var].values.reshape(-1,1)

@Read data whose index column holds dates
fifa_data = pd.read_csv(fifa_filepath, index_col="Date", parse_dates=True)
