Logistic Regression: Principles and Code

The Logistic Regression Model

From linear regression we know that the linear regression equation is: \[y = \theta_0 + \theta_1 x_1 + \cdots + \theta_n x_n = \Theta^T X\] Logistic Regression simply feeds this linear model through the sigmoid function \( \sigma(z) = \frac{1}{1 + e^{-z}} \), so the Logistic Regression hypothesis is: \[ h_\theta (x) = \frac{1}{1 + e^{-\Theta^T X}} \] Therefore \(h_\theta (x)\) always takes a value in (0, 1). When \(h_\theta (x) > 0.5\) we predict the positive class, i.e. \(y = 1\); when \(h_\theta (x) < 0.5\) we predict the negative class, i.e. \(y = 0\); when \(h_\theta (x) = 0.5\) either class may be chosen.
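
As a quick illustration of the model above, here is a minimal sketch (the weights and the sample are made-up numbers, not from any dataset used in this post) of computing \(h_\theta(x)\) and applying the 0.5 decision rule:

import numpy as np

def hypothesis(theta, x):
    # h_theta(x) = sigmoid(Theta^T x); the result is always in (0, 1)
    return 1.0 / (1.0 + np.exp(-np.dot(theta, x)))

# Hypothetical weights; x starts with 1.0 so that theta_0 acts as the bias term
theta = np.array([0.5, -1.2, 2.0])
x = np.array([1.0, 0.8, 1.5])

p = hypothesis(theta, x)
print(p)                     # about 0.927, a probability in (0, 1)
print(1 if p >= 0.5 else 0)  # predicted class: 1 (positive), since p > 0.5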

Loss Function

By maximum likelihood estimation from probability theory, we maximize the likelihood: \[ \max_{\Theta} L(x; \Theta) = \prod_i p(y_i \mid x_i, \Theta) \] Taking the logarithm of both sides: \[ \log L(x; \Theta) = \sum_i \log p(y_i \mid x_i, \Theta) \] From the Logistic Regression hypothesis we have: \[ p(y_i \mid x_i, \Theta) = h_\theta(x_i)^{y_i} \cdot (1-h_\theta(x_i))^{1-y_i} \] Substituting and simplifying gives: \[ J(\Theta) = \log L(x; \Theta) = \sum_i \left[ y_i \log h_\theta(x_i) + (1-y_i)\log(1-h_\theta(x_i)) \right] \] Using the sigmoid identity \( \sigma'(z) = \sigma(z)(1-\sigma(z)) \), differentiating gives: \[ \frac{\partial J(\Theta)}{\partial \theta} = \sum_i (y_i - h_\theta(x_i))\, x_i \] Since we are maximizing \(J(\Theta)\), the code below performs gradient ascent, updating the weights by \( \theta \leftarrow \theta + \alpha\, (y - h_\theta(x))\, x \).
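
As a sanity check of this derivative, here is a small sketch (with made-up toy data, not part of the derivation itself) comparing the analytic gradient \(\sum_i (y_i - h_\theta(x_i))\, x_i\) against a finite-difference estimate of the log-likelihood:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def log_likelihood(theta, X, y):
    h = sigmoid(X.dot(theta))
    return np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))

def analytic_grad(theta, X, y):
    # dJ/dtheta = sum_i (y_i - h_theta(x_i)) * x_i
    return X.T.dot(y - sigmoid(X.dot(theta)))

# Made-up toy data: 4 samples, 3 features (first column is the bias term)
X = np.array([[1.0, 0.5, 1.2], [1.0, -0.3, 0.8], [1.0, 1.5, -0.6], [1.0, -1.0, -1.1]])
y = np.array([1.0, 0.0, 1.0, 0.0])
theta = np.array([0.1, -0.2, 0.3])

eps = 1e-6
numeric = np.array([
    (log_likelihood(theta + eps * e, X, y) - log_likelihood(theta - eps * e, X, y)) / (2 * eps)
    for e in np.eye(len(theta))
])
print(analytic_grad(theta, X, y))  # should closely match the numeric estimate below
print(numeric)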

PS: these symbols are a pain to type; I never want to write them again.

Implementation Code

import numpy as np
import matplotlib.pyplot as plt

def loadData():
    """Load the toy 2-D dataset: each row is [1.0, x1, x2] plus a 0/1 label."""
    dataMat = []
    labelMat = []
    fr = open('dataSet/testSet.txt')
    for line in fr.readlines():
        lineArr = line.strip().split()
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])  # prepend 1.0 for the bias term
        labelMat.append(int(lineArr[2]))
    return dataMat, labelMat

def sigmoid(x):
    return 1.0 / (1 + np.exp(-x))

def GradientDecent0(data, label):
    """Batch gradient ascent: every update uses the whole dataset."""
    dataMatrix = np.mat(data)
    labelMatrix = np.mat(label).transpose()
    m, n = np.shape(dataMatrix)
    alpha = 0.001
    maxCycles = 500
    weights = np.ones((n, 1))
    for k in range(maxCycles):
        h = sigmoid(dataMatrix * weights)
        error = labelMatrix - h
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights

def GradientDecent1(data, label):
    """Stochastic gradient ascent: update with one sample at a time, fixed alpha."""
    data = np.array(data)
    m, n = np.shape(data)
    alpha = 0.01
    w = np.ones((n, 1))
    for j in range(500):
        for i in range(m):
            h = sigmoid(np.dot(data[i], w))
            error = label[i] - h
            w = w + alpha * error * np.transpose([data[i]])
    return w

def GradientDecent2(data, label, num_iter=500):
    """Improved stochastic gradient ascent: decaying alpha, samples visited in random order."""
    data = np.array(data)
    m, n = np.shape(data)
    w = np.ones((n, 1))
    for i in range(num_iter):
        index = list(range(m))
        for j in range(m):
            alpha = 4 / (1.0 + i + j) + 0.01  # step size decays but never reaches 0
            id = int(np.random.uniform(0, len(index)))  # random position among the remaining samples
            h = sigmoid(np.dot(data[index[id]], w))
            error = label[index[id]] - h
            w = w + alpha * error * np.transpose([data[index[id]]])
            del index[id]  # each sample is used exactly once per pass
    return w

def plotBestFit(weights):
    """Plot the toy dataset and the decision boundary Theta^T x = 0."""
    weights = np.asarray(weights).flatten()  # accept either a numpy matrix or an array
    data, label = loadData()
    dataArr = np.array(data)
    n = np.shape(dataArr)[0]
    xcord1, ycord1 = [], []
    xcord2, ycord2 = [], []
    for i in range(n):
        if int(label[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = np.arange(-3.0, 3.0, 0.1)
    # On the boundary w0 + w1*x1 + w2*x2 = 0, so x2 = (-w0 - w1*x1) / w2
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('X2')
    plt.show()

def prediction(data, w):
    """Classify one sample: positive class if sigmoid(Theta^T x) >= 0.5."""
    prob = sigmoid(np.dot(data, w))
    if prob[0] >= 0.5:
        return 1
    else:
        return 0

def dataClean():
    """Train on the horse-colic training set and report the error rate on the test set."""
    fr_train = open('dataSet/horse_train.txt')
    fr_test = open('dataSet/horse_test.txt')
    train_data, train_label = [], []
    for line in fr_train.readlines():
        ln = line.strip().split('\t')
        lnArr = [float(ln[i]) for i in range(21)]
        train_data.append(lnArr)
        train_label.append(float(ln[-1]))
    w = GradientDecent0(train_data, train_label)
    print(w)
    cnt_error, num_data = 0, 0
    for line in fr_test.readlines():
        num_data += 1
        ln = line.strip().split('\t')
        lnArr = [float(ln[i]) for i in range(21)]
        if prediction(lnArr, w) != float(ln[-1]):
            cnt_error += 1
    error_rate = cnt_error * 1.0 / num_data
    print("error_rate: %f" % error_rate)
    return error_rate

dataClean()
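
A minimal usage sketch, assuming dataSet/testSet.txt is present as in loadData() above: train the batch and the improved stochastic versions on the toy dataset and plot their decision boundaries.

data, label = loadData()
w0 = GradientDecent0(data, label)   # batch gradient ascent; returns a numpy matrix
w2 = GradientDecent2(data, label)   # improved stochastic version; returns an ndarray
plotBestFit(w0)   # plotBestFit flattens the weights, so both return types are accepted
plotBestFit(w2)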