Building an automatic differentiation engine: tinytorch 02
Community article · Published January 30, 2024
Let's write backpropagation
Backpropagation is super simple, hardly any trouble at all. The steps:
- The derivative of anything with respect to itself is 1
- The backward pass of small ops can be hard-coded
- For a multivariable function f(x, y), treat x and y as constants in turn, so the function becomes fx(y) and fy(x) (see the small numerical check after this list)
- An op is a node that stores the intermediate variables
- From op.backward we get 2 tensor gradients
- Set the gradient on each node
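To make the multivariable point concrete, here is a tiny numerical check (my own illustration in plain Python, not part of tinytorch): for f(x, y) = x·y, holding y fixed recovers ∂f/∂x = y, and holding x fixed recovers ∂f/∂y = x.

```python
# Tiny numerical check of the "treat the other variable as a constant" rule
# (illustration only, not part of tinytorch). For f(x, y) = x*y:
def f(x, y):
    return x * y

x, y, eps = 8.0, 5.0, 1e-6

dfdx = (f(x + eps, y) - f(x, y)) / eps  # y held constant -> ~5.0 == y
dfdy = (f(x, y + eps) - f(x, y)) / eps  # x held constant -> ~8.0 == x
print(dfdx, dfdy)
```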
Now, here is the simplest possible backward pass:
```python
def backward(self, grad=None):
    if grad is None:
        grad = Tensor([1.])
    self.grad = grad

    op = self._ctx.op
    child_nodes = self._ctx.args

    grads = op.backward(self._ctx, grad)
    for tensor, grad in zip(child_nodes, grads):
        tensor.grad = grad
```
We take the gradient as an input, defaulting to None; if it is None we set it to 1. We then pass the gradient to op.backward, which gives us 2 gradients, and we assign each gradient to the corresponding child node.
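For context, `self._ctx` is the `Function` object we attach to the result tensor in `__add__`/`__mul__` (shown in the full listing at the end of the post); it is nothing more than a record of the op and its input tensors:

```python
class Function:
    def __init__(self, op, *args):
        self.op = op      # the op class, e.g. Add or Mul
        self.args = args  # the input Tensors, i.e. the child nodes
```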
Let's test it.
```python
if __name__ == "__main__":
    x = Tensor([8])
    y = Tensor([5])

    print("Add")
    z = x + y
    print(z)
    z.backward()
    print(f"x: {x} , grad {x.grad}")
    print(f"y: {y} , grad {y.grad}")
    print("=" * 100)

    print("Mul")
    z = x * y
    print(z)
    z.backward()
    print(f"x: {x} , grad {x.grad}")
    print(f"y: {y} , grad {y.grad}")
    print("=" * 100)
```
The output should look like this:
```
~/workspace/tinytorch master
❯ /home/joey/miniforge3/bin/python /home/joey/workspace/tinytorch/tinytorch.py
Add
tensor([13])
x: tensor([8]) , grad tensor([1])
y: tensor([5]) , grad tensor([1])
=======================================================
Mul
tensor([40])
x: tensor([8]) , grad tensor([5])
y: tensor([5]) , grad tensor([8])
=======================================================
```
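These numbers line up with the hand-computed partials: for \( z = x + y \) both \( \partial z/\partial x \) and \( \partial z/\partial y \) are 1, and for \( z = x \cdot y \) we get \( \partial z/\partial x = y = 5 \) and \( \partial z/\partial y = x = 8 \).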
Commit c5a9452c0ef0e23363d84cd44cacd362d53f398b
Connecting the graph
Now that we can get gradients for + and *, let's extend this to composite functions. Suppose the function \( f(x) = x^3 + 2 \cdot x \) can be broken down into 3 steps.
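Written out with the only two ops we have so far (`+` and `*`), one possible decomposition looks like this (a sketch of mine; the intermediate names `t1`/`t2` are just for illustration):

```python
x = Tensor([2.0])

t1 = x * x * x   # step 1: x^3 (two Mul nodes under the hood)
t2 = x + x       # step 2: 2*x, written as x + x since we only have + and *
z = t1 + t2      # step 3: x^3 + 2*x

# every result carries a _ctx pointing at its op and its inputs,
# which is exactly the graph we need to walk backwards through
```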
This is what it looks like as a graph:
So now we need to pass the gradient back through the earlier parts of the computation graph. Let's modify the backward method:
```python
def backward(self, grad=None):
    if self._ctx is None:
        return

    if grad is None:
        grad = Tensor([1.])
    self.grad = grad

    op = self._ctx.op
    child_nodes = self._ctx.args

    grads = op.backward(self._ctx, grad)
    for tensor, grad in zip(child_nodes, grads):
        if tensor.grad is None:
            tensor.grad = Tensor(np.zeros_like(self.data))
        tensor.grad += grad
        tensor.backward(grad)
```
If `self._ctx` is None we return early; it means we have reached the end of the graph (a leaf tensor). And instead of simply setting the gradient, we now initialize it with zeros (the same shape as the original tensor) and add the incoming gradient onto it: `tensor.grad += grad` is where gradients get accumulated. Finally, `tensor.backward(grad)` recurses so the gradient keeps flowing down to that node's own children.
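As a quick sanity check of the accumulation (my own example, not from the original post): when the same tensor feeds an op twice, it should receive both contributions.

```python
x = Tensor([3.0])
z = x * x      # x is used twice, so dz/dx = 2x
z.backward()
print(x.grad)  # expect tensor([6.]): a contribution of 3 from each branch
```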
We also need to change the Add and Mul methods to use the upstream gradient:
```python
class Add:
    @staticmethod
    def forward(x, y):
        return Tensor(x.data + y.data)

    @staticmethod
    def backward(ctx, grad):
        x, y = ctx.args
        return Tensor([1]) * grad, Tensor([1]) * grad


class Mul:
    @staticmethod
    def forward(x, y):
        return Tensor(x.data * y.data)  # z = x*y

    @staticmethod
    def backward(ctx, grad):
        x, y = ctx.args
        return Tensor(y.data) * grad, Tensor(x.data) * grad  # dz/dx, dz/dy
```
Now we multiply each local gradient by the upstream gradient, so the gradients change according to whatever gradient flows in from above.
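This is just the chain rule. Call the final output of the graph \( L \); the `grad` argument is the upstream gradient \( \partial L / \partial z \), and the local derivative of `Mul` with respect to its first input is \( \partial z / \partial x = y \), so the gradient we send down is

\[
\frac{\partial L}{\partial x} = \frac{\partial L}{\partial z} \cdot \frac{\partial z}{\partial x} = \mathrm{grad} \cdot y ,
\]

which is exactly the `Tensor(y.data) * grad` returned by `Mul.backward` (and \( 1 \cdot \mathrm{grad} \) for `Add`).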
Let's test it.
```python
if __name__ == "__main__":
    def f(x):
        return x * x * x + x

    x = Tensor([1.2])
    z = f(x)
    z.backward()
    print(f"X: {x} grad: {x.grad}")
```
```
❯ python tinytorch.py
X: tensor([1.2]) grad: tensor([5.32])
```
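That matches the analytic derivative: \( f'(x) = 3x^2 + 1 \), so \( f'(1.2) = 3 \cdot 1.44 + 1 = 5.32 \).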
Let's visualize this. We can now compute the gradient at every point, backpropagate it, and plot it.
Add the code below to our `visualize.py`. Here we compute and plot the gradient at each point.
```python
import graphviz
import matplotlib.pyplot as plt

from tinytorch import *

G = graphviz.Digraph(format="png")
G.clear()


def visit_nodes(G: graphviz.Digraph, node: Tensor):
    uid = str(id(node))
    G.node(uid, f"Tensor: ({str(node.data)} {'grad: ' + str(node.grad.data) if node.grad is not None else ''}) ")
    if node._ctx:
        ctx_uid = str(id(node._ctx))
        G.node(ctx_uid, f"Context: {str(node._ctx.op.__name__)}")
        G.edge(uid, ctx_uid)
        for child in node._ctx.args:
            G.edge(ctx_uid, str(id(child)))
            visit_nodes(G, child)


def f(x):
    return x * x * x + x


# Defining the function to plot the given function and its derivative using the custom Tensor class
def plot_function_and_derivative():
    # Values for x ranging from -3 to 3
    x_values_custom = np.linspace(-3, 3, 100)
    y_values_custom = []
    derivative_values_custom = []

    # Using the custom Tensor class to calculate the function and its derivative for each x value
    for x_val in x_values_custom:
        x_tensor = Tensor([x_val])
        y_tensor = f(x_tensor)
        y_tensor.backward()
        y_values_custom.append(y_tensor.data[0])
        derivative_values_custom.append(x_tensor.grad.data[0])

    # Plotting the original function and its derivative using the custom implementation
    plt.plot(x_values_custom, y_values_custom, label="f(x) = x^3 + x (custom)")
    plt.plot(x_values_custom, derivative_values_custom, label="f'(x) = 3x^2 + 1 (custom)")
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title('Plot of the Function and its Derivative (Custom Implementation)')
    plt.legend()
    plt.grid(True)
    plt.show()


if __name__ == "__main__":
    plot_function_and_derivative()

    x = Tensor([1.2])
    z = f(x)
    z.backward()

    visit_nodes(G, z)
    G.render(directory="vis", view=True)
    print(f"Z:{x} grad:{x.grad}")
```
tinytorch.py, for reference:
```python
import numpy as np


class Tensor:
    def __init__(self, data):
        self.data = data if isinstance(data, np.ndarray) else np.array(data)
        self.grad = None
        self._ctx = None

    def __add__(self, other):
        fn = Function(Add, self, other)
        result = Add.forward(self, other)
        result._ctx = fn
        return result

    def __mul__(self, other):
        fn = Function(Mul, self, other)
        result = Mul.forward(self, other)
        result._ctx = fn
        return result

    def __repr__(self):
        return f"tensor({self.data})"

    def backward(self, grad=None):
        if self._ctx is None:
            return

        if grad is None:
            grad = Tensor([1.])
        self.grad = grad

        op = self._ctx.op
        child_nodes = self._ctx.args

        grads = op.backward(self._ctx, grad)
        for tensor, grad in zip(child_nodes, grads):
            if tensor.grad is None:
                tensor.grad = Tensor(np.zeros_like(self.data))
            tensor.grad += grad
            tensor.backward(grad)


class Function:
    def __init__(self, op, *args):
        self.op = op
        self.args = args


class Add:
    @staticmethod
    def forward(x, y):
        return Tensor(x.data + y.data)

    @staticmethod
    def backward(ctx, grad):
        x, y = ctx.args
        return Tensor([1]) * grad, Tensor([1]) * grad


class Mul:
    @staticmethod
    def forward(x, y):
        return Tensor(x.data * y.data)  # z = x*y

    @staticmethod
    def backward(ctx, grad):
        x, y = ctx.args
        return Tensor(y.data) * grad, Tensor(x.data) * grad  # dz/dx, dz/dy


if __name__ == "__main__":
    def f(x):
        return x * x * x + x

    x = Tensor([1.2])
    z = f(x)
    z.backward()
    print(f"X: {x} grad: {x.grad}")
```
The Graphviz computation graph
Commit f1fb4c1b7b0ea51d270651337c600a2a3cabe267