# 逻辑回归

sigmoid function: $\sigma(z) = \dfrac{1}{1 + e^{-z}}$

## 训练算法：求参数

• 将数据转化成矩阵
• 求梯度
from numpy import *
def loadDataSet(filename):
    """Load a whitespace-separated data file into feature and label lists.

    Each line is expected to hold three fields: x1 x2 label.

    Args:
        filename: path to the text data file.

    Returns:
        (datamat, labelmat): datamat is a list of [1.0, x1, x2] rows —
        the leading 1.0 is the constant (bias) term x0 — and labelmat
        is a list of int class labels.
    """
    datamat = []
    labelmat = []
    with open(filename) as f:
        # BUG FIX: the original fragment lost the `def` header and the
        # `for line in f:` loop (it referenced undefined `line`).
        for line in f:
            line_arr = line.strip().split()
            # 1.0 is the coefficient of the constant (bias) term
            datamat.append([1.0, float(line_arr[0]), float(line_arr[1])])
            labelmat.append(int(line_arr[2]))
    return datamat, labelmat
def sigmoid(inX):
    '''
    Logistic sigmoid of inX: 1 / (1 + e^(-inX)).
    inX is typically the score w^T·x.
    '''
    denominator = 1 + exp(-inX)
    return 1.0 / denominator
def gradAscent(dataMatIn, classLabels):
    '''
    Batch gradient ascent for logistic regression.

    Args:
        dataMatIn: m x n 2-D list/array of training samples (each row
            should already include the constant term x0 = 1.0).
        classLabels: sequence of m class labels (0 or 1).

    Returns:
        numpy matrix of shape (n, 1) with the fitted weights.
    '''
    # BUG FIX: the original fragment lost the `def` header.
    dataMatrix = mat(dataMatIn)
    # column vector of labels
    labelMat = mat(classLabels).transpose()
    m, n = shape(dataMatrix)
    alpha = 0.001    # learning-rate / step size
    maxCycles = 500  # fixed number of iterations
    # weights is the parameter vector, initialised to all ones
    weights = ones((n, 1))
    for k in range(maxCycles):
        # predicted probabilities for every sample (m x 1)
        h = sigmoid(dataMatrix * weights)
        error = labelMat - h
        # gradient-ascent update: w <- w + alpha * X^T (y - h)
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights

import matplotlib
import matplotlib.pyplot as plt
from numpy import *

def plot_fit(data, labelMat, weights):
    '''
    Scatter-plot the two classes and draw the fitted decision boundary.

    Args:
        data: training samples as rows [1.0, x1, x2].
        labelMat: sequence of 0/1 class labels, one per sample.
        weights: fitted parameter vector [w0, w1, w2].
    '''
    dataArr = array(data)
    n = shape(dataArr)[0]

    # split the points by class so they can be coloured differently
    x_cord1 = []; y_cord1 = []
    x_cord2 = []; y_cord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            x_cord1.append(dataArr[i, 1]); y_cord1.append(dataArr[i, 2])
        else:
            x_cord2.append(dataArr[i, 1]); y_cord2.append(dataArr[i, 2])

    fig = plt.figure()
    # BUG FIX: `ax` was used without being created, raising NameError
    ax = fig.add_subplot(111)
    ax.scatter(x_cord1, y_cord1, s=30, c='red', marker='s')
    ax.scatter(x_cord2, y_cord2, s=30, c='green')

    # decision boundary: w0 + w1*x1 + w2*x2 = 0  =>  x2 = (-w0 - w1*x1)/w2
    x = arange(-3.0, 3.0, 0.1)
    y = ((-weights[0] - weights[1] * x) / weights[2]).transpose()
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
plot_fit(datamat,labelmat,weights)
![image](image.png)

## 随机梯度上升

def stocGradAscent0(dataMatrix, classLabels):
    '''
    Stochastic gradient ascent: a single pass over the samples,
    updating the weights after each individual sample.

    Args:
        dataMatrix: m x n array of training samples (rows include x0).
        classLabels: sequence of m class labels (0 or 1).

    Returns:
        1-D numpy array of n fitted weights.
    '''
    m, n = shape(dataMatrix)
    alpha = 0.01  # fixed step size
    weights = ones(n)
    for i in range(m):
        # scalar prediction for sample i
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        # BUG FIX: the original indexed dataMatrix with capital `I`
        # (an undefined name); it must be the loop variable `i`.
        weights = weights + alpha * error * array(dataMatrix[i])
    return weights

plot_fit(datamat,labelmat,weight2)
![image](image.png)

def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    '''
    Improved stochastic gradient ascent.

    Two improvements over the plain version:
      * alpha shrinks as iterations progress, which damps periodic
        oscillation of the coefficients; the constant 0.01 keeps the
        step size from ever reaching zero.
      * samples are visited in random order, without replacement
        within each pass, which reduces periodic fluctuations.

    Args:
        dataMatrix: m x n array of training samples (rows include x0).
        classLabels: sequence of m class labels (0 or 1).
        numIter: number of full passes over the data (default 150).

    Returns:
        1-D numpy array of n fitted weights.
    '''
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        # indices not yet used in this pass; a list so entries can be removed
        dataIndex = list(range(m))
        for i in range(m):
            # decaying step size; never 0 thanks to the 0.01 term
            alpha = 0.01 + 4 / (1.0 + j + i)
            # uniform(x, y) draws a random real in [x, y)
            randIndex = int(random.uniform(0, len(dataIndex)))
            sample = dataIndex[randIndex]
            h = sigmoid(sum(dataMatrix[sample] * weights))
            error = classLabels[sample] - h
            weights = weights + alpha * error * array(dataMatrix[sample])
            # BUG FIX: consume the chosen index so each sample is visited
            # exactly once per pass; the original built dataIndex but then
            # sampled with replacement from the full range and never
            # removed anything (and `del` on a range object would fail).
            del dataIndex[randIndex]
    return weights
'''
1.每次迭代调整alpha的原因：步长随迭代次数衰减可以缓解系数的周期性波动，
  而常数项0.01保证alpha永远不会减小到0，使新样本在多次迭代后仍有影响
2.随机选择样本可以减少周期波动
'''
plot_fit(datamat,labelmat,weights3)
![image](image.png)

## 疝气病预测病马死亡率

• 用特征均值填补
• 用特殊值填补，比如-1
• 忽略这部分样本
• 用类似样本均值填补
• 用机器学习算法来预测

1. 选一个实数值替换，选0
2. 标签缺失，需要丢弃数据

def getDateSet(filepath):
    '''
    Read a tab-separated data file and split it into a feature matrix
    (with a leading x0 = 1 column added) and a label vector.

    Args:
        filepath: path to the data file; every line holds float fields
            separated by tabs, the last field being the class label.

    Returns:
        (dataSet, Labels): dataSet is an m x n numpy array whose first
        column is all ones (the constant term); Labels is a 1-D numpy
        array holding the last column of the file.
    '''
    import numpy as np

    arr = []
    with open(filepath) as f:
        # BUG FIX: the original iterated over the undefined name `file`;
        # the open handle is `f`.
        for x in f:
            mid = x.strip().split('\t')
            vector = list(map(float, mid))
            arr.append(vector)
    # extract features and labels with numpy, adding the x0 column
    mid2 = np.array(arr)
    # BUG FIX: the ones column was sized with len(file) (undefined);
    # use the number of rows actually read.
    x0 = np.array([1] * len(arr))
    dataSet = np.column_stack((x0, mid2[:, :-1]))
    Labels = mid2[:, -1]
    return dataSet, Labels
trainingdataSet,trainingLabels = getDateSet('./horseColicTraining.txt')
testdataSet,testLabels = getDateSet('./horseColicTest.txt')
array([ 2.88523241e+00,  4.60371241e+01, -3.44351209e+00,  1.81776542e+00,
-1.22312262e+00, -4.51162382e-02, -1.26815180e+01, -5.19715412e+00,
-1.62249176e+01,  1.69815852e+00, -1.42154178e+01,  2.51022948e+01,
-1.01675395e+01,  3.62858014e+01, -5.73351142e+00, -7.52889830e+00,
8.52370700e+00, -9.93327494e+00, -1.10860304e+00,  1.43608141e+00,
-2.06483905e+00, -5.98332914e+00])
# `weights` was fitted earlier in the session (module-level parameters)
# test step: compute the score w^T x for every test sample
result = mat(testdataSet)*mat(weights).T
# z holds the raw scores; sig applies the sigmoid to each score,
# giving the probability of class 1 per sample
z = result.tolist()
sig = list(map(lambda x:sigmoid(x[0]),result.tolist()))

def classifyVector(x):
    '''
    Turn a sigmoid probability into a hard class label:
    1.0 when the probability exceeds 0.5, otherwise 0.0.
    '''
    return 1.0 if x > 0.5 else 0.0
# hard 0/1 predictions from the logistic-regression probabilities
lg_result = [classifyVector(x) for x in sig]
# NOTE(review): the result of this call is discarded — tolist() does not
# modify testLabels in place; the next line converts it again anyway
testLabels.tolist()
# pair each prediction with its true label
zipped = zip(lg_result,testLabels.tolist())
n = list(zipped)
# classification error rate: fraction of mismatched (pred, truth) pairs
errorRate = len([x for x in n if x[0]-x[1]!=0])/len(n)

def errorTest():
    '''
    Compute and print the classification error rate of the
    logistic-regression model on the horse-colic test set.

    NOTE(review): classification uses the module-level `weights`
    vector fitted earlier in the session; the training set is loaded
    here but not used — TODO: fit weights from it inside this function.

    Returns:
        The error rate as a float in [0, 1].
    '''
    # data preparation (training set kept for parity with the notebook flow)
    trainingdataSet, trainingLabels = getDateSet('./horseColicTraining.txt')
    testdataSet, testLabels = getDateSet('./horseColicTest.txt')

    # score every test sample: w^T x
    result = mat(testdataSet) * mat(weights).T

    # sigmoid of each score -> probability of class 1
    sig = list(map(lambda x: sigmoid(x[0]), result.tolist()))

    # hard 0/1 predictions
    lg_result = [classifyVector(x) for x in sig]
    # pair predictions with the true labels
    pairs = list(zip(lg_result, testLabels.tolist()))
    # error rate = fraction of mismatched pairs.  (Removed two dead
    # statements from the original: an unused `z = result.tolist()`
    # and a discarded `testLabels.tolist()` call.)
    errorRate = len([p for p in pairs if p[0] - p[1] != 0]) / len(pairs)
    print('错误率:%f' % errorRate)
    return errorRate
errorTest()

0.2835820895522388
def multiTest():
    '''Run errorTest() ten times and print the mean error rate.'''
    num_tests = 10
    error_sum = 0.0
    for _ in range(num_tests):
        error_sum += errorTest()
    print('经过10次迭代后平均错误率:%f' % (float(error_sum / num_tests)))
multiTest()

## 代码整理

def sigmoid(inX):
    '''
    Logistic sigmoid: map the score inX (typically w^T·x) to (0, 1)
    via 1 / (1 + e^(-inX)).
    '''
    denom = 1 + exp(-inX)
    return 1.0 / denom

def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    '''
    Improved stochastic gradient ascent.

    Improvements over the plain single-pass version:
      * alpha decays with the iteration count, damping the periodic
        oscillation of the coefficients; the 0.01 constant keeps the
        step size strictly positive.
      * samples are drawn in random order without replacement within
        each pass, reducing periodic fluctuations.

    Args:
        dataMatrix: m x n array of training samples (rows include x0).
        classLabels: sequence of m class labels (0 or 1).
        numIter: number of full passes over the data (default 150).

    Returns:
        1-D numpy array of n fitted weights.
    '''
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        # indices not yet visited in this pass (list, so deletable)
        dataIndex = list(range(m))
        for i in range(m):
            # decaying step size; never 0 thanks to the 0.01 term
            alpha = 0.01 + 4 / (1.0 + j + i)
            # uniform(x, y) draws a random real in [x, y)
            randIndex = int(random.uniform(0, len(dataIndex)))
            sample = dataIndex[randIndex]
            h = sigmoid(sum(dataMatrix[sample] * weights))
            error = classLabels[sample] - h
            weights = weights + alpha * error * array(dataMatrix[sample])
            # BUG FIX: remove the used index so each sample is visited
            # exactly once per pass (the original sampled with
            # replacement and never consumed dataIndex).
            del dataIndex[randIndex]
    return weights

def getDateSet(filepath):
    '''
    Read a tab-separated data file and split it into a feature matrix
    (with a leading x0 = 1 column) and a label vector.

    Args:
        filepath: path to the data file; each line holds tab-separated
            float fields, the last field being the class label.

    Returns:
        (dataSet, Labels): dataSet is an m x n numpy array whose first
        column is all ones (the constant term); Labels is a 1-D numpy
        array with the file's last column.
    '''
    import numpy as np

    arr = []
    with open(filepath) as f:
        # BUG FIX: the original looped over the undefined name `file`;
        # the open handle is `f`.
        for x in f:
            mid = x.strip().split('\t')
            vector = list(map(float, mid))
            arr.append(vector)
    # extract features and labels with numpy, adding the x0 column
    mid2 = np.array(arr)
    # BUG FIX: size the ones column from the rows read, not len(file)
    x0 = np.array([1] * len(arr))
    dataSet = np.column_stack((x0, mid2[:, :-1]))
    Labels = mid2[:, -1]
    return dataSet, Labels

def classifyVector(x):
    '''
    Classify from a sigmoid output: probabilities above 0.5 map to
    class 1.0, everything else to class 0.0.
    '''
    return 1.0 if x > 0.5 else 0.0

# Script entry point: run the repeated error-rate evaluation when
# this file is executed directly (not when imported).
if __name__ =="__main__":
    multiTest()