# sklearn逻辑回归案例分析 《良/恶性乳腺癌肿瘤预测》

import pandas as pd
import numpy as np

# Headers for the raw table: a sample id, nine feature columns and the
# 'Class' label (last entry). The raw file is assumed to have no header row.
column_names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

# NOTE(review): the original snippet used `data` without ever creating it
# (NameError). The load below is reconstructed on the assumption that the
# source is the UCI "breast-cancer-wisconsin" file -- confirm the location,
# or point this at a local copy of the data.
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'breast-cancer-wisconsin/breast-cancer-wisconsin.data',
    names=column_names,
)

# Turn the '?' placeholder into NaN so pandas can treat it as missing.
data = data.replace(to_replace='?', value=np.nan)
# Drop every row that contains at least one missing value.
data = data.dropna(how='any')
print(data.shape)

# `sklearn.cross_validation` was removed in scikit-learn 0.20; the splitter
# now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Features are columns 1-9 of `column_names`; the target is column 10.
# 25% of the rows are held out for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data[column_names[1:10]],
    data[column_names[10]],
    test_size=0.25,
    random_state=5,
)

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Standardize every feature to zero mean and unit variance so that no single
# large-valued feature dominates the fitted model.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)  # learn mean/variance on the training set, then transform it
X_test = ss.transform(X_test)        # reuse the training statistics; never refit on test data

# The hyper-parameters here are fixed by hand; they could instead be tuned
# with cross-validation.
# solver='liblinear' is spelled out because the L1 penalty is not supported
# by the default 'lbfgs' solver of scikit-learn >= 0.22 (it raises
# ValueError); 'liblinear' was the default before that, so older versions
# behave exactly as before.
lr = LogisticRegression(C=1.0, penalty='l1', tol=0.1, solver='liblinear')
lr.fit(X_train, y_train)          # train on the standardized training data
lr_predict = lr.predict(X_test)   # predicted class labels for the test set

from sklearn.metrics import accuracy_score

# Report the test-set accuracy in two ways: through the estimator's own
# score() method, and by comparing the stored predictions with the true
# labels via accuracy_score().
print("预测结果为：{}".format(lr.score(X_test, y_test)))
print("预测结果为：{}".format(accuracy_score(y_test, lr_predict)))