机器学习之线性回归_覃秉丰——源码

一元线性回归

import numpy as np
from matplotlib import pyplot as plt
# Load the two-column CSV: column 0 is used as the input, column 1 as the target.
data = np.genfromtxt('data.csv', delimiter=',')
x_data, y_data = data[:, 0], data[:, 1]
# To preview the raw samples, uncomment:
# plt.scatter(x_data, y_data)
# plt.show()

# Gradient-descent settings.
lr = 0.0001   # learning rate (step size of each update)
epochs = 500  # maximum number of iterations over the data

# Starting point of the line y = k * x + b.
k = -2  # slope
b = -2  # intercept


# Least-squares cost of the line y = k * x + b.
def compute_loss(x_data, y_data, k, b):
    """Return half the mean squared error of the line ``k * x + b``.

    Args:
        x_data: Sequence of input values.
        y_data: Sequence of target values, read at the same indices as
            ``x_data``.
        k: Slope of the line.
        b: Intercept of the line.

    Returns:
        The sum of squared residuals divided by ``2 * len(x_data)``.
    """
    squared_residuals = (
        (y_data[idx] - (k * x + b)) ** 2
        for idx, x in enumerate(x_data)
    )
    return sum(squared_residuals) / (2.0 * len(x_data))


# Batch gradient descent for the line y = k * x + b.
def gradient(x_data, y_data, k, b, lr, epochs):
    """Fit slope and intercept with batch gradient descent.

    Every epoch accumulates the averaged gradient of the squared-error cost
    over all samples, then moves ``k`` and ``b`` one step of size ``lr``
    against that gradient.

    Args:
        x_data: Sequence of input values.
        y_data: Sequence of target values, read at the same indices as
            ``x_data``.
        k: Initial slope.
        b: Initial intercept.
        lr: Learning rate.
        epochs: Number of update steps to perform.

    Returns:
        Tuple ``(k, b)`` holding the slope and intercept after the last step.
    """
    m = float(len(x_data))

    for _ in range(epochs):
        slope_grad = 0
        intercept_grad = 0
        for idx, x in enumerate(x_data):
            # Signed prediction error of the current line on this sample.
            residual = (x * k + b) - y_data[idx]
            intercept_grad += (1 / m) * residual
            slope_grad += (1 / m) * residual * x
        k -= lr * slope_grad
        b -= lr * intercept_grad

        # To watch the fit evolve, plot x_data against k * x_data + b here
        # every few epochs.

    return k, b
# Report the cost of the starting line before any training.
initial_loss = compute_loss(x_data, y_data, k, b)
print('starting k = {0} ,b = {1} ,error = {2} '.format(k, b, initial_loss))

# Train: replace the starting slope/intercept with the fitted ones.
k, b = gradient(x_data, y_data, k, b, lr, epochs)

# Draw the fitted line in red, then the raw samples as blue dots.
fitted_line = k * x_data + b
plt.plot(x_data, fitted_line, 'r')
plt.plot(x_data, y_data, 'b.')

# Report the cost and parameters after training, then show the figure.
final_loss = compute_loss(x_data, y_data, k, b)
print('loss =:', final_loss, 'b =:', b, 'k =:', k)
plt.show()

使用sklearn的一元线性回归

import numpy as np
from matplotlib import  pyplot as plt
from sklearn.linear_model import LinearRegression
# Load the two-column CSV: column 0 is used as the input, column 1 as the target.
data = np.genfromtxt(r'data.csv', delimiter=',')
x_data, y_data = data[:, 0], data[:, 1]
print(x_data)

# Turn both 1-D columns into (n_samples, 1) column arrays before fitting.
x_data = x_data[:, np.newaxis]
y_data = y_data[:, np.newaxis]

# Create the model and fit it on the single feature.
model = LinearRegression()
model.fit(x_data, y_data)

# Blue dots: raw samples. Red line: the model's predictions at the same inputs.
predictions = model.predict(x_data)
plt.plot(x_data, y_data, 'b.')
plt.plot(x_data, predictions, 'r')
plt.show()

多元线性回归

import numpy as np
from numpy import genfromtxt
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D #用来画3D图的包
# Load the delivery data set and echo it for inspection.
data = genfromtxt(r"Delivery.csv", delimiter=',')
print(data)

# Every column except the last is a feature; the last column is the target.
x_data, y_data = data[:, :-1], data[:, -1]
print(x_data)
print(y_data)

# Gradient-descent settings.
lr = 0.0001    # learning rate
epochs = 1000  # maximum number of iterations

# All three model parameters start at zero.
theta0 = theta1 = theta2 = 0

# Least-squares cost of the plane theta0 + theta1 * x0 + theta2 * x1.
def compute_error(theta0, theta1, theta2, x_data, y_data):
    """Return the mean squared error of a two-feature linear model.

    Args:
        theta0: Intercept term.
        theta1: Weight applied to column 0 of ``x_data``.
        theta2: Weight applied to column 1 of ``x_data``.
        x_data: Array indexed as ``x_data[row, col]``; columns 0 and 1 are read.
        y_data: Target values, read at the same row indices as ``x_data``.

    Returns:
        The sum of squared residuals divided by ``len(x_data)``.
    """
    n_samples = len(x_data)
    squared_residuals = (
        (y_data[row]
         - (theta0 + theta1 * x_data[row, 0] + theta2 * x_data[row, 1])) ** 2
        for row in range(n_samples)
    )
    return sum(squared_residuals) / float(n_samples)

# Batch gradient descent for the two-feature linear model.
def gradient_descent_runner(x_data, y_data, theta0, theta1, theta2, lr, epochs):
    """Fit ``theta0 + theta1 * x0 + theta2 * x1`` with batch gradient descent.

    Args:
        x_data: Array indexed as ``x_data[row, col]``; columns 0 and 1 are read.
        y_data: Target values, read at the same row indices as ``x_data``.
        theta0: Initial intercept.
        theta1: Initial weight for column 0.
        theta2: Initial weight for column 1.
        lr: Learning rate.
        epochs: Number of update steps to perform.

    Returns:
        Tuple ``(theta0, theta1, theta2)`` after the last update.
    """
    # Number of samples, as a float so the averaging below is true division.
    m = float(len(x_data))

    for _ in range(epochs):
        grad0 = 0
        grad1 = 0
        grad2 = 0
        # Accumulate the averaged gradient over every sample.
        for row in range(len(x_data)):
            feature0 = x_data[row, 0]
            feature1 = x_data[row, 1]
            # Signed prediction error of the current model on this sample.
            residual = (theta1 * feature0 + theta2 * feature1 + theta0) - y_data[row]
            grad0 += (1 / m) * residual
            grad1 += (1 / m) * feature0 * residual
            grad2 += (1 / m) * feature1 * residual
        # Step each parameter against its gradient.
        theta0 -= lr * grad0
        theta1 -= lr * grad1
        theta2 -= lr * grad2
    return theta0, theta1, theta2
# Report the error of the all-zero starting parameters.
initial_error = compute_error(theta0, theta1, theta2, x_data, y_data)
print("Starting theta0 = {0}, theta1 = {1}, theta2 = {2}, error = {3}".
      format(theta0, theta1, theta2, initial_error))

# Train, then report the fitted parameters and their error.
print("Running...")
theta0, theta1, theta2 = gradient_descent_runner(x_data, y_data, theta0, theta1, theta2, lr, epochs)
final_error = compute_error(theta0, theta1, theta2, x_data, y_data)
print("After {0} iterations theta0 = {1}, theta1 = {2}, theta2 = {3}, error = {4}".
      format(epochs, theta0, theta1, theta2, final_error))

# Create a 3-D axes on a new figure.
figure = plt.figure()
ax = figure.add_subplot(111, projection='3d')

# Scatter the raw samples as large red circular markers.
x0 = x_data[:, 0]
x1 = x_data[:, 1]
ax.scatter(x0, x1, y_data, c='r', marker='o', s=100)

# Build a 2-D grid from the two feature columns and evaluate the fitted
# plane at every grid point.
x0, x1 = np.meshgrid(x0, x1)
z = theta0 + x0 * theta1 + x1 * theta2

# Draw the fitted plane.
ax.plot_surface(x0, x1, z)

# Label the axes.
ax.set_xlabel('Miles')
ax.set_ylabel('Num of Deliveries')
ax.set_zlabel('Time')

# Display the figure.
plt.show()

使用sklearn的多元线性回归

import numpy as np
from numpy import genfromtxt
from sklearn import linear_model
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# NOTE: scikit-learn's LinearRegression solves the least-squares problem
# directly instead of iterating with gradient descent, so its parameters can
# differ from those of the hand-written gradient-descent version.

# Load the delivery data set and echo it for inspection.
data = genfromtxt(r"Delivery.csv", delimiter=',')
print(data)

# Every column except the last is a feature; the last column is the target.
x_data, y_data = data[:, :-1], data[:, -1]
print(x_data)
print(y_data)

# Create and fit the model.
model = linear_model.LinearRegression()
model.fit(x_data, y_data)

# Fitted weights and intercept.
print("coefficients:", model.coef_)
print("intercept:", model.intercept_)

# Predict for one hand-written sample.
x_test = [[102, 4]]
predict = model.predict(x_test)
print("predict:", predict)

# Create a 3-D axes and scatter the raw samples as large red circular markers.
ax = plt.figure().add_subplot(111, projection='3d')
x0 = x_data[:, 0]
x1 = x_data[:, 1]
ax.scatter(x0, x1, y_data, c='r', marker='o', s=100)

# Build a 2-D grid from the two feature columns and evaluate the fitted plane
# on it; plot_surface needs 2-D inputs, so the plane is computed from the
# coefficients on the grid rather than taken from model.predict(x_data).
x0, x1 = np.meshgrid(x0, x1)
weight0, weight1 = model.coef_[0], model.coef_[1]
z = model.intercept_ + x0 * weight0 + x1 * weight1
ax.plot_surface(x0, x1, z)

# Label the axes.
ax.set_xlabel('Miles')
ax.set_ylabel('Num of Deliveries')
ax.set_zlabel('Time')

# Display the figure.
plt.show()

最后是多项式回归

import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures#多项式
from sklearn.linear_model import LinearRegression

# Load the data. Row 0 is skipped (presumably a header row that genfromtxt
# cannot parse as numbers -- confirm against job.csv). Column 1 is plotted as
# "Position level" and column 2 as "Salary".
data = np.genfromtxt(r"job.csv", delimiter=",")
x_data = data[1:, 1]
y_data = data[1:, 2]
plt.scatter(x_data, y_data)
plt.show()

# Turn both 1-D columns into (n_samples, 1) column arrays before fitting.
x_data = x_data[:, np.newaxis]
y_data = y_data[:, np.newaxis]

# Baseline: fit and plot a plain straight-line model.
model = LinearRegression()
model.fit(x_data, y_data)
plt.plot(x_data, y_data, 'b.')
plt.plot(x_data, model.predict(x_data), 'r')
plt.show()

# Polynomial regression: `degree` controls how many polynomial terms are
# generated from the single input feature.
poly_reg = PolynomialFeatures(degree=5)
# Fit the feature expansion once, on the training inputs only.
x_poly = poly_reg.fit_transform(x_data)
# Fit an ordinary linear model on the expanded features.
lin_reg = LinearRegression()
lin_reg.fit(x_poly, y_data)

# Plot the polynomial fit at the training inputs, reusing the features that
# were already computed instead of refitting the transformer.
plt.plot(x_data, y_data, 'b.')
plt.plot(x_data, lin_reg.predict(x_poly), c='r')
plt.title('Truth or Bluff (Polynomial Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Plot the same fit on a dense grid of 100 inputs in [1, 10] for a smooth
# curve. New inputs go through transform(), not fit_transform(): the
# transformer must not be refitted on evaluation data.
plt.plot(x_data, y_data, 'b.')
x_test = np.linspace(1, 10, 100)
x_test = x_test[:, np.newaxis]
plt.plot(x_test, lin_reg.predict(poly_reg.transform(x_test)), c='r')
plt.title('Truth or Bluff (Polynomial Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

来源：oschina

链接：https://my.oschina.net/u/4369994/blog/4301740

标签

XData