问题
I am trying to implement from scratch the multiclass logistic regression but my implementation returns bad results. I believe the definition of the gradient function and the cost function is fine. Maybe there is a problem with how these functions are interacting with the minimize
function. I have tried it but I could not find out what is wrong. Could you please cast some light?
You can add the estimator 'myLR': myLR(**par_dict),
with paramters
par_dict= {'alpha': 0.1, 'maxit': 2000, 'opt_method': 'bfgs', 'positive': False, 'penalty': None, 'verbose': True, 'seed': 3}
in this example or in any of these examples to test it.
import numpy as np
from scipy.optimize import minimize
from sklearn import preprocessing
class myLR():
def __init__(self, alpha=0.1, reltol=1e-8, maxit=1000, opt_method=None, verbose=True, seed=0):
self.alpha = alpha
self.maxit = maxit
self.reltol = reltol
self.seed = seed
self.verbose = verbose
self.opt_method = opt_method
self.lbin = preprocessing.LabelBinarizer()
def w_2d(self, w, n_classes):
return np.reshape(w, (-1, n_classes), order='F')
def softmax(self, W, X):
a = np.exp(X @ W)
o = a / np.sum(a, axis=1, keepdims=True)
return o
def cost_wraper(self, W):
return self.cost(W, self.X, self.T, self.n_samples, self.n_classes)
def cost(self, W, X, T, n_samples, n_classes):
W = self.w_2d(W, n_classes)
log_O = np.log(self.softmax(W, X))
reg = self.apha * np.linalg.norm(W, ord='fro')
c = -np.sum([np.vdot(T[[i]], log_O[[i]]) for i in range(n_samples)]) / n_samples + reg
return c
def gradient_wraper(self, W):
return self.gradient(W, self.X, self.T, self.n_samples, self.n_classes)
def gradient(self, W, X, T, n_samples, n_classes):
W = self.w_2d(W, n_classes)
O = self.softmax(W, X)
reg = self.alpha * W
grad = -X.T.dot(T - O) / n_samples + reg
return grad.flatten()
def fit(self, X, y=None):
self.n_classes = len(np.unique(y))
self.n_samples, n_features = X.shape
if self.n_classes == 2:
self.T = np.zeros((self.n_samples, self.n_classes), dtype=np.float64)
for i, cls in enumerate(range(self.n_classes)):
self.T[y == cls, i] = 1
else:
self.T = self.lbin.fit_transform(y)
self.X = X
np.random.seed(self.seed)
W_0 = np.random.random(n_features * self.n_classes)
options = {'disp': self.verbose, 'maxiter': self.maxit}
f_min = minimize(fun=self.cost_wraper, x0=W_0,
method=self.opt_method,
jac=self.gradient_wraper,
options=options)
self.coef_ = self.w_2d(f_min.x, self.n_classes)
self.W_ = self.coef_
return self
def predict_proba(self, X):
O = self.softmax(self.coef_, X)
return O
def predict(self, X):
sigma = self.predict_proba(X)
y_pred = np.argmax(sigma, axis=1)
return y_pred
Edit: Regularization term is included.
回答1:
I think it is now working with the following code.
import numpy as np
from scipy.optimize import minimize
from sklearn import preprocessing
class myLR():
def __init__(self, reltol=1e-8, maxit=1000, opt_method=None, verbose=True, seed=0):
self.maxit = maxit
self.reltol = reltol
self.seed = seed
self.verbose = verbose
self.opt_method = opt_method
self.lbin = preprocessing.LabelBinarizer()
def w_2d(self, w, n_classes):
return np.reshape(w, (n_classes, -1))
def softmax(self, W, X):
a = np.exp(X @ W.T)
o = a / np.sum(a, axis=1, keepdims=True)
return o
def squared_norm(self, x):
x = np.ravel(x, order='K')
return np.dot(x, x)
def cost(self, W, X, T, n_samples, n_classes):
W = self.w_2d(W, n_classes)
log_O = np.log(self.softmax(W, X))
c = -(T * log_O).sum()
return c / n_samples
def gradient(self, W, X, T, n_samples, n_classes):
W = self.w_2d(W, n_classes)
O = self.softmax(W, X)
grad = -(T - O).T.dot(X)
return grad.ravel() / n_samples
def fit(self, X, y=None):
n_classes = len(np.unique(y))
n_samples, n_features = X.shape
if n_classes == 2:
T = np.zeros((n_samples, n_classes), dtype=np.float64)
for i, cls in enumerate(np.unique(y)):
T[y == cls, i] = 1
else:
T = self.lbin.fit_transform(y)
np.random.seed(self.seed)
W_0 = np.random.random((self.n_classes, self.n_features))
options = {'disp': self.verbose, 'maxiter': self.maxit}
f_min = minimize(fun=self.cost, x0=W_0,
args=(X, T, n_samples, n_classes),
method=self.opt_method,
jac=self.gradient,
options=options)
self.coef_ = self.w_2d(f_min.x, n_classes)
self.W_ = self.coef_
return self
def predict_proba(self, X):
O = self.softmax(self.W_, X)
return O
def predict(self, X):
sigma = self.predict_proba(X)
y_pred = np.argmax(sigma, axis=1)
return y_pred
来源:https://stackoverflow.com/questions/65765309/multi-class-logistic-regression-from-scratch