Covers the full workflow: downloading and loading the dataset, train/test splitting, data visualization, data cleaning (attribute selection, missing-value handling, categorical-to-numeric conversion, and feature scaling), basic model training (linear regression, decision tree, random forest), model evaluation with cross-validation, and hyperparameter tuning with grid search and randomized search.
The code is a bit messy, but it runs.
import os
import tarfile
import urllib.request
import pandas as pd

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download housing.tgz and extract housing.csv into housing_path."""
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    print("downloaded!")
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
    print("extracted!")

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

# fetch_housing_data()  # uncomment on the first run to download the data
housing = load_housing_data()
# housing.describe()
# housing.info()
# import matplotlib.pyplot as plt
# housing.hist(bins=50, figsize=(20, 15))
# plt.show()

import numpy as np
import hashlib

# Hash-based split: an instance goes to the test set if the last byte of the
# hash of its id is below 256 * test_ratio; this keeps the split stable
# across reruns even when new data is appended.
def test_set_check(identifier, test_ratio, hash):
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
    return data.loc[~in_test_set], data.loc[in_test_set]

housing_with_id = housing.reset_index()
# Build a stable id from longitude and latitude for the train/test split
housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

# Train/test split with scikit-learn
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
print(len(train_set), "train +", len(test_set), "test")

# Create an income category attribute for stratified sampling
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

# Compare income-category proportions in the full set and in both splits
housing["income_cat"].value_counts() / len(housing)
strat_train_set["income_cat"].value_counts() / len(strat_train_set)
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

# Remove the income_cat attribute
for set_ in (strat_train_set, strat_test_set):
    set_.drop(["income_cat"], axis=1, inplace=True)

# Visualize the data
import matplotlib.pyplot as plt
housing = strat_train_set.copy()
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
             s=housing["population"] / 100, label="population",
             c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True)
plt.legend()

# Separate the predictors from the labels
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

# Fill missing total_bedrooms values with the median
# median = housing["total_bedrooms"].median()
# housing["total_bedrooms"].fillna(median)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)
# imputer.statistics_
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns)
housing_tr.describe()

# LabelEncoder: map the text categories to integers
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded

# OneHotEncoder: turn the integer categories into one-hot vectors
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
housing_cat_1hot.toarray()  # fit_transform returns a sparse matrix
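A side note: the LabelEncoder-then-OneHotEncoder detour above reflects older scikit-learn. Since version 0.20, OneHotEncoder accepts string categories directly, so the integer step can be skipped. A minimal sketch (same column, my variable names):

from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
# Pass a 2-D DataFrame (double brackets); strings are encoded directly
housing_cat_1hot = cat_encoder.fit_transform(housing[["ocean_proximity"]])
cat_encoder.categories_  # the categories learned from the data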
# LabelBinarizer: encode and one-hot in a single step
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot

# Full preprocessing pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Select a subset of DataFrame columns and return them as a NumPy array,
# so plain DataFrames can feed a scikit-learn Pipeline
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

# LabelBinarizer.fit/transform take only one argument, so wrap it to match
# the (X, y) signature a Pipeline expects
class MyLabelBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)
    def fit(self, X, y=None):
        self.encoder.fit(X)
        return self
    def transform(self, X, y=None):
        return self.encoder.transform(X)

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
    ("selector", DataFrameSelector(num_attribs)),
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])

cat_pipeline = Pipeline([
    ("selector", DataFrameSelector(cat_attribs)),
    ("label_binarizer", MyLabelBinarizer()),
])

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

housing_prepared = full_pipeline.fit_transform(housing)

# Train a linear regression model
from sklearn.linear_model import LinearRegression
import joblib  # sklearn.externals.joblib was removed; use the standalone package

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

# Try predicting on a few instances
some_data = housing.iloc[:5]
some_data_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:\t", lin_reg.predict(some_data_prepared))
print("Labels:\t", list(some_data_labels))

# Save the model
joblib.dump(lin_reg, "my_lin_reg_model.pkl")

'''
# RMSE on the training set
from sklearn.metrics import mean_squared_error
housing_prediction = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_prediction)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
'''

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# Train a decision tree model
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
joblib.dump(tree_reg, "my_tree_reg_model.pkl")           # save
tree_reg_loaded = joblib.load("my_tree_reg_model.pkl")   # reload the saved model
housing_prediction = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_prediction)
tree_rmse = np.sqrt(tree_mse)
print("tree_rmse:", tree_rmse)

# Evaluate the decision tree with 10-fold cross-validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
display_scores(rmse_scores)

# Evaluate the linear regression model with cross-validation
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

# Evaluate a random forest model with cross-validation
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)
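Before moving on to hyperparameter search, a modernization note: the DataFrameSelector and MyLabelBinarizer workarounds above predate ColumnTransformer. With scikit-learn 0.20 or later, the same preprocessing can be expressed as a drop-in replacement for the FeatureUnion (a sketch; output sparsity may differ, but the feature layout is the same):

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric columns: median imputation followed by standardization
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])

# ColumnTransformer selects DataFrame columns by name, so no custom
# selector class is needed
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)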
# Grid search for the best random forest hyperparameters
# (commented out in the original; the best model was saved to
#  forest_reg_model_best.pkl and is reloaded below)
'''
from sklearn.model_selection import GridSearchCV
param_grid = [
    {"n_estimators": [3, 10, 30], "max_features": [2, 4, 6, 8]},
    {"bootstrap": [False], "n_estimators": [3, 10], "max_features": [2, 3, 4]},
]
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring="neg_mean_squared_error")
grid_search.fit(housing_prepared, housing_labels)
grid_search.best_params_
grid_search.best_estimator_
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
grid_search.best_estimator_.feature_importances_
joblib.dump(grid_search.best_estimator_, "forest_reg_model_best.pkl")  # save the best model
'''

# Evaluate the best model on the test set
final_model = joblib.load("forest_reg_model_best.pkl")
x_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()
x_test_prepared = full_pipeline.transform(x_test)
x_test_prediction = final_model.predict(x_test_prepared)
final_mse = mean_squared_error(y_test, x_test_prediction)
final_rmse = np.sqrt(final_mse)
print("final_rmse:", final_rmse)

# Grid search for the best SVR hyperparameters
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

param_grid = [
    {"kernel": ["linear"], "C": [10, 1000, 2000]},
    # gamma must be strictly positive, so the grid starts above zero
    {"kernel": ["rbf"], "C": [10, 1000, 2000], "gamma": [0.05, 0.1, 0.2, 0.4]},
]
svr = SVR()
grid_search = GridSearchCV(svr, param_grid, cv=5,
                           scoring="neg_mean_squared_error")
grid_search.fit(housing_prepared, housing_labels)
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
final_model = grid_search.best_estimator_
x_test_prediction = final_model.predict(x_test_prepared)
svr_final_mse = mean_squared_error(y_test, x_test_prediction)
svr_final_rmse = np.sqrt(svr_final_mse)
print("best_estimator:", grid_search.best_estimator_)
print("svr_final_rmse:", svr_final_rmse)

# Randomized search for the best SVR model
from sklearn.model_selection import RandomizedSearchCV
# The discrete grid above has only 3 + 3*4 = 15 combinations, so keep
# n_iter below that to actually sample a subset
random_search = RandomizedSearchCV(svr, param_grid, n_iter=10, cv=5,
                                   scoring="neg_mean_squared_error",
                                   random_state=42)
random_search.fit(housing_prepared, housing_labels)
cvres = random_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
final_model = random_search.best_estimator_
x_test_prediction = final_model.predict(x_test_prepared)
svr_final_mse = mean_squared_error(y_test, x_test_prediction)
svr_final_rmse = np.sqrt(svr_final_mse)
print("rand_best_estimator:", final_model)
print("rand_svr_final_rmse:", svr_final_rmse)
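Sampling from a fixed list undersells randomized search; it pays off when hyperparameters are drawn from continuous distributions. A minimal sketch, assuming scipy is available (the distribution ranges are illustrative, not tuned):

from scipy.stats import reciprocal, uniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR

# reciprocal is log-uniform: useful when the right scale for C is unknown
param_distribs = {
    "kernel": ["rbf"],
    "C": reciprocal(10, 2000),    # log-uniform over [10, 2000]
    "gamma": uniform(0.01, 0.4),  # uniform over [0.01, 0.41]
}

rnd_search = RandomizedSearchCV(SVR(), param_distribs, n_iter=20, cv=5,
                                scoring="neg_mean_squared_error",
                                random_state=42)
rnd_search.fit(housing_prepared, housing_labels)
print("best params:", rnd_search.best_params_)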