# From Scratch: Hand-Rolling a Multi-Layer Perceptron (MLP) in NumPy and Aligning the Results with PyTorch
Now that deep learning frameworks are highly abstracted, many developers are used to assembling networks from ready-made APIs. But when you actually implement a multi-layer perceptron (MLP) in pure NumPy and strictly align it with PyTorch's results, you discover how much understanding of neural networks is hiding behind those APIs. This article walks through the whole process, covering these key steps:

- Turning the math into code: deriving and implementing forward and backward propagation by hand
- Numerical stability tricks: practical remedies for vanishing/exploding gradients
- Reproducing framework-level features: a PyTorch-style automatic differentiation mechanism
- A strict verification regime: layer-by-layer comparison tests against PyTorch

## 1. Core Neural-Network Components

### 1.1 The Tensor Foundation

We start with the most basic tensor class; every later component builds on it:

```python
import numpy as np

class Tensor:
    def __init__(self, data, requires_grad=False):
        self.data = np.array(data, dtype=np.float32)
        self.grad = None
        self.requires_grad = requires_grad
        self._grad_fn = None          # the Function node that produced this tensor

    def __matmul__(self, other):
        return MatMul()(self, other)

    def __add__(self, other):
        return Add()(self, other)

    def __sub__(self, other):
        return Sub()(self, other)

    def backward(self, grad=None):
        if not self.requires_grad:
            return
        if grad is None:
            grad = np.ones_like(self.data)   # seed gradient for a scalar loss
        if self._grad_fn is not None:
            self._grad_fn.backward(grad)
```

### 1.2 The Automatic Differentiation System

A PyTorch-like autograd mechanism needs to record the node relationships of the computation graph. Each `Function` caches its inputs during the forward pass; its `backward` writes gradients onto those inputs and then continues recursively up the chain, so a single `loss.backward()` call reaches every parameter (gradients are overwritten rather than accumulated, which suffices for a plain feedforward network):

```python
class Function:
    def __call__(self, *inputs):
        self.inputs = inputs
        output = self.forward(*[x.data for x in inputs])
        output = Tensor(output)
        if any(x.requires_grad for x in inputs):
            output.requires_grad = True
            output._grad_fn = self
        return output

    def forward(self, *inputs):
        raise NotImplementedError

    def backward(self, grad):
        raise NotImplementedError


class MatMul(Function):
    def forward(self, x, w):
        return x @ w

    def backward(self, grad):
        x, w = self.inputs
        if x.requires_grad:
            x.grad = grad @ w.data.T
            x.backward(x.grad)        # continue the chain upstream
        if w.requires_grad:
            w.grad = x.data.T @ grad


# Add and Sub are referenced by Tensor.__add__ / __sub__ above.
class Add(Function):
    def forward(self, x, y):
        return x + y

    def backward(self, grad):
        x, y = self.inputs
        if x.requires_grad:
            x.grad = grad
            x.backward(grad)
        if y.requires_grad:
            g = grad
            while g.ndim > y.data.ndim:   # reduce broadcast axes so a bias
                g = g.sum(axis=0)          # vector gets a matching gradient
            y.grad = g


class Sub(Function):
    def forward(self, x, y):
        return x - y

    def backward(self, grad):
        x, y = self.inputs
        if x.requires_grad:
            x.grad = grad
            x.backward(grad)
        if y.requires_grad:
            y.grad = -grad
```

## 2. The Complete MLP

### 2.1 Layer Abstraction

A modular design lets the layers be combined freely:

```python
class Module:
    def parameters(self):
        return []

    def zero_grad(self):
        for p in self.parameters():
            p.grad = None

    def forward(self, x):
        raise NotImplementedError

    def __call__(self, x):
        return self.forward(x)


class Linear(Module):
    def __init__(self, in_features, out_features):
        # He initialization: scaling by sqrt(2 / fan_in) keeps activation
        # variance stable under ReLU
        self.weight = Tensor(
            np.random.randn(in_features, out_features) * np.sqrt(2. / in_features),
            requires_grad=True
        )
        self.bias = Tensor(np.zeros(out_features), requires_grad=True)

    def forward(self, x):
        return x @ self.weight + self.bias

    def parameters(self):
        return [self.weight, self.bias]
```

### 2.2 Activation Functions

The common activation functions together with their gradients:

```python
class ReLU(Function):
    def forward(self, x):
        self.mask = x > 0
        return x * self.mask

    def backward(self, grad):
        x, = self.inputs
        if x.requires_grad:
            x.grad = grad * self.mask
            x.backward(x.grad)

def relu(x):
    return ReLU()(x)


class Sigmoid(Function):
    def forward(self, x):
        self.out = 1 / (1 + np.exp(-x))
        return self.out

    def backward(self, grad):
        x, = self.inputs
        if x.requires_grad:
            x.grad = grad * self.out * (1 - self.out)
            x.backward(x.grad)

def sigmoid(x):
    return Sigmoid()(x)
```

## 3. Training Flow and Optimizer

### 3.1 Loss Functions

Two commonly used losses:

```python
class MSELoss(Function):
    def forward(self, pred, target):
        self.diff = pred - target
        return np.mean(self.diff ** 2)

    def backward(self, grad):
        pred = self.inputs[0]
        if pred.requires_grad:
            # d/dpred of mean(diff^2) is 2 * diff / N
            pred.grad = grad * 2 * self.diff / np.prod(self.diff.shape)
            pred.backward(pred.grad)


class CrossEntropyLoss(Function):
    def forward(self, x, target):
        # softmax with the row-wise max subtracted first, for numerical stability
        exp = np.exp(x - np.max(x, axis=1, keepdims=True))
        self.probs = exp / np.sum(exp, axis=1, keepdims=True)
        return -np.mean(np.log(self.probs[np.arange(len(target)), target.argmax(axis=1)]))

    def backward(self, grad):
        x, target = self.inputs
        labels = target.data.argmax(axis=1)   # inputs are Tensors; unwrap with .data
        grad_input = self.probs.copy()
        grad_input[np.arange(len(labels)), labels] -= 1
        if x.requires_grad:
            x.grad = grad * grad_input / len(labels)
            x.backward(x.grad)
```

### 3.2 The Optimizer

A plain SGD optimizer:

```python
class SGD:
    def __init__(self, params, lr=0.01):
        self.params = params
        self.lr = lr

    def step(self):
        for p in self.params:
            if p.grad is not None:
                p.data -= self.lr * p.grad

    def zero_grad(self):
        for p in self.params:
            p.grad = None
```
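The tests in the next section build networks through an `MLP` container, and the training script in section 6 uses a `Sequential` container plus `train()`/`eval()` mode switches; none of these are defined in the post itself. A minimal sketch compatible with the classes above might look like this (`Sequential`, `MLP`, and the `training` flag are our assumptions, modeled loosely on PyTorch's API):

```python
class Sequential(Module):
    """Chains Module and Function layers; a rough stand-in for nn.Sequential."""
    def __init__(self, *layers):
        self.layers = layers

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

    def parameters(self):
        # Function layers such as ReLU() carry no parameters
        return [p for layer in self.layers
                if isinstance(layer, Module) for p in layer.parameters()]

    def train(self):
        self._set_mode(True)

    def eval(self):
        self._set_mode(False)

    def _set_mode(self, flag):
        # only layers with a `training` attribute (e.g. BatchNorm) care
        for layer in self.layers:
            if hasattr(layer, "training"):
                layer.training = flag


def MLP(sizes):
    """Builds a Linear/ReLU stack from a width list such as [784, 128, 10]."""
    layers = []
    for i, (n_in, n_out) in enumerate(zip(sizes[:-1], sizes[1:])):
        layers.append(Linear(n_in, n_out))
        if i < len(sizes) - 2:            # no activation after the output layer
            layers.append(ReLU())
    return Sequential(*layers)
```

With this container, `MLP([784, 128, 10])` expands to Linear(784, 128) → ReLU → Linear(128, 10), which is what the `TorchMLP` reference in the tests below is presumed to mirror with `nn.Linear` layers of the same widths.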
## 4. Strict Alignment Verification Against PyTorch

### 4.1 Numerical Consistency Testing

To be confident that our implementation matches PyTorch, we need a strict test regime:

- Parameter alignment: start both networks from identical weights (note that seeding NumPy and PyTorch separately does *not* produce the same numbers; in practice you copy one network's initial parameters into the other, transposing the weight matrices since PyTorch stores `Linear` weights as `(out, in)`)
- Forward-pass verification: compare the outputs layer by layer for numerical differences
- Gradient verification: compare the gradients produced by backpropagation
- Training verification: compare parameter values after several epochs of training

```python
import numpy as np
import torch

def test_forward_consistency():
    # Build two networks with the same architecture
    np.random.seed(42)
    our_net = MLP([784, 128, 10])
    torch.manual_seed(42)
    torch_net = TorchMLP([784, 128, 10])   # a matching nn.Linear/ReLU stack
    # (copy torch_net's initial parameters into our_net here, as noted above)

    # Same input batch for both
    x = np.random.randn(32, 784)
    our_out = our_net(Tensor(x))
    torch_out = torch_net(torch.tensor(x, dtype=torch.float32))

    # Compare the outputs
    max_diff = np.max(np.abs(our_out.data - torch_out.detach().numpy()))
    print(f"max forward-pass difference: {max_diff:.6f}")
    assert max_diff < 1e-6
```

### 4.2 Gradient Verification

Gradient computation is the core of the network and must match exactly:

```python
import torch.nn.functional as F

def test_backward_consistency():
    # Same setup: identical architectures and (copied) identical parameters
    np.random.seed(42)
    our_net = MLP([784, 128, 10])
    torch_net = TorchMLP([784, 128, 10])
    x = np.random.randn(32, 784)
    y = np.random.randint(0, 10, size=32)

    # Our implementation (our CrossEntropyLoss takes one-hot targets)
    our_out = our_net(Tensor(x))
    loss = CrossEntropyLoss()(our_out, Tensor(np.eye(10)[y]))
    loss.backward()

    # PyTorch reference
    torch_out = torch_net(torch.tensor(x, dtype=torch.float32))
    torch_loss = F.cross_entropy(torch_out, torch.tensor(y))
    torch_loss.backward()

    # Compare gradients parameter by parameter (transpose PyTorch's weight
    # gradients where the layouts differ, as for the weights themselves)
    for our_p, torch_p in zip(our_net.parameters(), torch_net.parameters()):
        max_diff = np.max(np.abs(our_p.grad - torch_p.grad.detach().numpy()))
        print(f"max parameter-gradient difference: {max_diff:.6f}")
        assert max_diff < 1e-6
```

## 5. Key Techniques for Real Training

### 5.1 Learning-Rate Scheduling

PyTorch-style learning-rate schedulers:

```python
class StepLR:
    def __init__(self, optimizer, step_size, gamma=0.1):
        self.optimizer = optimizer
        self.step_size = step_size
        self.gamma = gamma
        self.last_epoch = 0

    def step(self):
        self.last_epoch += 1
        if self.last_epoch % self.step_size == 0:
            self.optimizer.lr *= self.gamma


class ExponentialLR:
    def __init__(self, optimizer, gamma):
        self.optimizer = optimizer
        self.gamma = gamma

    def step(self):
        self.optimizer.lr *= self.gamma
```

### 5.2 Batch Normalization

Batch normalization matters a great deal when training deeper networks:

```python
class BatchNorm(Module):
    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        self.gamma = Tensor(np.ones(num_features), requires_grad=True)
        self.beta = Tensor(np.zeros(num_features), requires_grad=True)
        self.eps = eps
        self.momentum = momentum
        self.running_mean = np.zeros(num_features)
        self.running_var = np.ones(num_features)
        self.training = True

    def forward(self, x):
        if self.training:
            mean = x.data.mean(axis=0)
            var = x.data.var(axis=0)
            # exponential moving averages, used at inference time
            self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean
            self.running_var = (1 - self.momentum) * self.running_var + self.momentum * var
            x_norm = (x.data - mean) / np.sqrt(var + self.eps)
        else:
            x_norm = (x.data - self.running_mean) / np.sqrt(self.running_var + self.eps)
        # NOTE: operating on raw arrays here means gradients do not flow into
        # gamma/beta; a full version would wrap this in its own Function node
        return Tensor(x_norm * self.gamma.data + self.beta.data)

    def parameters(self):
        return [self.gamma, self.beta]
```

## 6. A Complete Training Example

Here is how to train our NumPy MLP on the MNIST dataset:

```python
def train_mnist():
    # Data (load_mnist is assumed to return flattened, normalized arrays)
    train_data, train_labels = load_mnist("train")
    test_data, test_labels = load_mnist("test")

    # Model
    model = Sequential(
        Linear(784, 256), ReLU(),
        Linear(256, 64), ReLU(),
        Linear(64, 10)
    )

    # Optimizer, schedule, and loss
    optimizer = SGD(model.parameters(), lr=0.1)
    scheduler = StepLR(optimizer, step_size=30, gamma=0.1)
    criterion = CrossEntropyLoss()

    # Training loop
    batch_size = 64
    for epoch in range(100):
        model.train()
        for i in range(0, len(train_data), batch_size):
            x = Tensor(train_data[i:i + batch_size])
            y = Tensor(np.eye(10)[train_labels[i:i + batch_size]])

            optimizer.zero_grad()
            output = model(x)
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()
        scheduler.step()

        # Evaluation
        model.eval()
        correct = 0
        for i in range(0, len(test_data), batch_size):
            x = Tensor(test_data[i:i + batch_size])
            pred = model(x).data.argmax(axis=1)
            correct += (pred == test_labels[i:i + batch_size]).sum()
        print(f"Epoch {epoch}: Accuracy {correct / len(test_data):.4f}")
```
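One final sanity check worth keeping around: a finite-difference gradient test catches backward-pass bugs without needing PyTorch at all. The helper below is our own addition (the `gradient_check` name and the small `[8, 16, 4]` network are illustrative, not from the original post); it nudges one randomly chosen entry per parameter and compares the numerical slope of the loss against the gradient stored by `backward()`:

```python
def gradient_check(eps=1e-3, tol=1e-2):
    np.random.seed(0)
    model = MLP([8, 16, 4])                    # a tiny net keeps the check fast
    x = Tensor(np.random.randn(5, 8))
    y = Tensor(np.eye(4)[np.random.randint(0, 4, size=5)])

    def loss_value():
        return float(CrossEntropyLoss()(model(x), y).data)

    # Analytic gradients from our backward pass
    CrossEntropyLoss()(model(x), y).backward()

    for p in model.parameters():
        idx = tuple(np.random.randint(s) for s in p.data.shape)
        saved = p.data[idx]
        p.data[idx] = saved + eps
        plus = loss_value()
        p.data[idx] = saved - eps
        minus = loss_value()
        p.data[idx] = saved                    # restore the original value
        numeric = (plus - minus) / (2 * eps)   # central difference
        # float32 arithmetic limits attainable precision, hence the loose tolerance
        assert abs(numeric - p.grad[idx]) < tol, (numeric, p.grad[idx])
    print("gradient check passed")
```

If this check passes but the PyTorch comparison in section 4 fails, the mismatch is almost certainly in initialization or weight-layout conventions rather than in the backward math.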