LoRA 代码填空 (Level 2-3)
本练习基于 LoRA(Low-Rank Adaptation)论文的核心思想,从低秩矩阵初始化到完整模块实现,逐步掌握参数高效微调技术。
前置知识
- 线性代数基础(矩阵乘法、低秩分解)
- PyTorch 基础(nn.Module、nn.Linear、nn.Parameter)
- Transformer 架构中的线性层
LoRA 核心公式
$$h = W_0 x + \Delta W\, x = W_0 x + \alpha \, W_B W_A x$$
其中 $W_0 \in \mathbb{R}^{d_{out} \times d_{in}}$ 为冻结的预训练权重,$W_B \in \mathbb{R}^{d_{out} \times r}$、$W_A \in \mathbb{R}^{r \times d_{in}}$ 为可训练的低秩矩阵,秩 $r \ll \min(d_{in}, d_{out})$,$\alpha$ 为缩放系数。
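下面用一个极简的数值例子验证这一公式(仅为示意,矩阵维度与变量名均为随意假设):构造随机的 $W_A$、$W_B$,检查 $\Delta W = W_B W_A$ 的秩不超过 $r$,且「旁路视角」与「合并视角」的输出一致。

```python
import torch

torch.manual_seed(0)
d_in, d_out, r, alpha = 6, 4, 2, 0.1

W0 = torch.randn(d_out, d_in)   # 预训练权重(冻结)
A = torch.randn(r, d_in)        # 降维矩阵 W_A
B = torch.randn(d_out, r)       # 升维矩阵 W_B(LoRA 初始时为零,这里取随机值便于演示)

delta_W = B @ A                 # [d_out, d_in],秩不超过 r
assert torch.linalg.matrix_rank(delta_W) <= r

x = torch.randn(3, d_in)
h_merged = x @ (W0 + alpha * delta_W).T          # 合并视角: 一次矩阵乘法
h_bypass = x @ W0.T + alpha * (x @ A.T) @ B.T    # 旁路视角: 原始输出 + 低秩旁路
assert torch.allclose(h_merged, h_bypass, atol=1e-5)
```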
练习 1:LoRA 低秩矩阵初始化与 forward(Level 2)
LoRA 的关键设计:A 矩阵使用 Kaiming 初始化(保证训练初期有梯度),B 矩阵初始化为零(保证训练开始时 $\Delta W = W_B W_A = 0$,模型行为与预训练模型完全一致)。
请补全 __init__ 中的矩阵初始化和 forward 中的前向计算。
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
class LoRALinear(nn.Module):
def __init__(self, original_linear, rank=4, alpha=0.1):
super().__init__()
self.alpha = alpha
self.dim_in = original_linear.in_features
self.dim_out = original_linear.out_features
self.r = rank
# 冻结原始权重
self.weight = nn.Parameter(
original_linear.weight.data.clone(), requires_grad=False
)
if original_linear.bias is not None:
self.bias = nn.Parameter(
original_linear.bias.data.clone(), requires_grad=False
)
else:
self.register_parameter('bias', None)
# LoRA 低秩矩阵 —— 使用 nn.Linear 封装
self.WA = nn.Linear(self.dim_in, self.r, bias=False)
self.WB = nn.Linear(self.r, self.dim_out, bias=False)
# TODO: 初始化 WA(Kaiming uniform)和 WB(全零)
# 目的: 训练初始时 Delta_W = WB @ WA = 0,不改变预训练行为
_____ # WA: Kaiming uniform 初始化
_____ # WB: 零初始化
def forward(self, X):
# 原始线性层的前向计算(无梯度)
h = F.linear(X, self.weight, self.bias)
# TODO: 计算 LoRA 旁路并叠加
# h_lora = h + alpha * WB(WA(X))
h_lora = _____
        return h_lora
```

提示
- nn.init.kaiming_uniform_ 用于 Kaiming 初始化,参数是 self.WA.weight
- nn.init.zeros_ 用于零初始化
- LoRA 旁路:输入先过 WA(降维到 r),再过 WB(升维回 d_out),乘以 alpha 后与原始输出相加
点击查看答案
```python
# 初始化部分
nn.init.kaiming_uniform_(self.WA.weight)
nn.init.zeros_(self.WB.weight)
# forward 部分
h_lora = h + self.alpha * self.WB(self.WA(X))
```

解析:
- WA 使用 Kaiming 初始化:保证在 ReLU/GeLU 等激活函数下,前向传播的方差稳定,训练初期能产生有意义的梯度。
- WB 初始化为零:这是 LoRA 的核心设计。因为 $\Delta W = W_B W_A$,当 $W_B = 0$ 时 $\Delta W = 0$,模型在训练开始时的行为与预训练模型完全一致。
- forward 中:self.WA(X) 将输入从 $d_{in}$ 维降到 $r$ 维,self.WB(...) 再升回 $d_{out}$ 维,乘以 alpha 控制旁路强度。
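零初始化的效果可以直接验证。下面是一段简短的检查代码(假设 LoRALinear 已按上面的答案补全):

```python
torch.manual_seed(0)
base = nn.Linear(64, 32)
lora = LoRALinear(base, rank=4, alpha=0.1)

x = torch.randn(2, 64)
with torch.no_grad():
    # WB 为零 => 旁路输出恒为零 => 与原始层输出完全一致
    assert torch.allclose(base(x), lora(x), atol=1e-6)

# 只有 WA/WB 是可训练参数,冻结的 weight/bias 不参与训练
print([n for n, p in lora.named_parameters() if p.requires_grad])
# ['WA.weight', 'WB.weight']
```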
练习 2:LoRA 的 merge 和 unmerge(Level 2)
在推理阶段,可以将 LoRA 的低秩矩阵合并回原始权重,这样推理时没有额外开销。请补全 merge 和 unmerge 方法。
```python
class LoRALinear(nn.Module):
def __init__(self, original_linear, rank=4, alpha=0.1):
super().__init__()
self.alpha = alpha
self.dim_in = original_linear.in_features
self.dim_out = original_linear.out_features
self.r = rank
self.merged = False
self.weight = nn.Parameter(
original_linear.weight.data.clone(), requires_grad=False
)
if original_linear.bias is not None:
self.bias = nn.Parameter(
original_linear.bias.data.clone(), requires_grad=False
)
else:
self.register_parameter('bias', None)
self.WA = nn.Linear(self.dim_in, self.r, bias=False)
self.WB = nn.Linear(self.r, self.dim_out, bias=False)
nn.init.kaiming_uniform_(self.WA.weight)
nn.init.zeros_(self.WB.weight)
def merge(self):
"""将 LoRA 参数合并到原始权重中"""
if not self.merged:
# TODO: 计算 delta_W 并加到 self.weight 上
# 注意 nn.Linear 的 weight 形状是 [out, in]
# WB.weight: [dim_out, r],WA.weight: [r, dim_in]
delta_W = _____
self.weight.data += _____
self.merged = True
def unmerge(self):
"""从权重中移除 LoRA 参数"""
if self.merged:
# TODO: 将 merge 的操作反转
delta_W = _____
self.weight.data -= _____
self.merged = False
def forward(self, X):
if self.merged:
# 合并后直接用原始线性层
return F.linear(X, self.weight, self.bias)
else:
h = F.linear(X, self.weight, self.bias)
h_lora = h + self.alpha * self.WB(self.WA(X))
            return h_lora
```

提示
- nn.Linear 的 weight 形状是 [out_features, in_features],即转置存储
- delta_W = WB.weight @ WA.weight,形状为 [dim_out, r] @ [r, dim_in] = [dim_out, dim_in]
- 别忘了乘 self.alpha
点击查看答案
```python
def merge(self):
if not self.merged:
# WB.weight: [dim_out, r], WA.weight: [r, dim_in]
delta_W = self.WB.weight @ self.WA.weight
self.weight.data += self.alpha * delta_W
self.merged = True
def unmerge(self):
if self.merged:
delta_W = self.WB.weight @ self.WA.weight
self.weight.data -= self.alpha * delta_W
        self.merged = False
```

解析:
- merge 操作:$W \leftarrow W + \alpha \, W_B W_A$。合并后,原始权重已经包含了微调信息,推理时只需一次矩阵乘法,没有额外计算开销。
- unmerge 操作:是 merge 的逆操作,$W \leftarrow W - \alpha \, W_B W_A$。当需要继续训练或切换不同的 LoRA adapter 时使用。
- 矩阵形状:nn.Linear 内部存储的 weight 形状为 [out, in](已转置),所以 WB.weight @ WA.weight 的形状恰好是 [dim_out, dim_in],与 self.weight 一致。
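merge/unmerge 的对称性可以用一个往返测试来检查(示意代码,假设上面的类已补全;这里人为给 WB 赋非零值来模拟训练后的状态,否则检查是平凡的):

```python
torch.manual_seed(0)
lora = LoRALinear(nn.Linear(32, 16), rank=4, alpha=0.5)
nn.init.normal_(lora.WB.weight, std=0.02)  # 模拟训练后 WB 非零

x = torch.randn(2, 32)
with torch.no_grad():
    y_bypass = lora(x)   # 未合并: 原始路径 + 旁路
    lora.merge()
    y_merged = lora(x)   # 已合并: 单次矩阵乘法
    lora.unmerge()
    y_back = lora(x)     # 还原后应与未合并时一致

assert torch.allclose(y_bypass, y_merged, atol=1e-5)
assert torch.allclose(y_bypass, y_back, atol=1e-5)
```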
练习 3:完整 LoRALinear 模块实现(Level 3)
给定接口,请从零实现一个完整的 LoRALinear 模块。要求:
- 冻结原始权重(requires_grad=False)
- 正确初始化 LoRA 矩阵(A: Kaiming,B: zeros)
- 实现 forward(原始路径 + LoRA 旁路)
- 实现 merge/unmerge
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
class LoRALinear(nn.Module):
"""
将一个 nn.Linear 层替换为带 LoRA 旁路的版本。
参数:
original_linear: 原始的 nn.Linear 层
rank: LoRA 的秩 r
alpha: 缩放系数
forward 行为:
未合并: h = X @ W^T + bias + alpha * WB(WA(X))
已合并: h = X @ (W + alpha * WB @ WA)^T + bias
"""
def __init__(self, original_linear, rank=4, alpha=0.1):
super().__init__()
# TODO: 实现完整的初始化
_____
def forward(self, X):
# TODO: 实现前向传播
_____
def merge(self):
# TODO: 将 LoRA 合并到原始权重
_____
def unmerge(self):
# TODO: 从原始权重中移除 LoRA
_____
# ====== 测试代码 ======
torch.manual_seed(42)
linear = nn.Linear(512, 256)
x = torch.randn(2, 10, 512)
lora = LoRALinear(linear, rank=8, alpha=0.5)
# 测试 1: 训练初始时输出应与原始 Linear 一致(因为 WB=0)
with torch.no_grad():
y_original = linear(x)
y_lora = lora(x)
assert torch.allclose(y_original, y_lora, atol=1e-5), "初始输出不一致!"
# 测试 2: merge 后输出不变
lora.merge()
with torch.no_grad():
y_merged = lora(x)
# 注意: merge 前 WB 可能已经训练过,这里直接 merge 初始化的参数
# 初始时 WB=0,所以 merge 后输出仍一致
assert torch.allclose(y_original, y_merged, atol=1e-5), "merge 后输出不一致!"
# 测试 3: unmerge 后输出不变
lora.unmerge()
with torch.no_grad():
y_unmerged = lora(x)
assert torch.allclose(y_original, y_unmerged, atol=1e-5), "unmerge 后输出不一致!"
print("所有测试通过!")提示
- 参考练习 1 和练习 2 的代码
- 注意处理 original_linear.bias 可能为 None 的情况
- F.linear(X, weight, bias) 等价于 X @ weight.T + bias
- 使用 self.merged 标志位控制 forward 的行为
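其中 F.linear 的等价性可以用一行断言验证:

```python
W = torch.randn(8, 16)   # [out_features, in_features]
b = torch.randn(8)
X = torch.randn(4, 16)
assert torch.allclose(F.linear(X, W, b), X @ W.T + b, atol=1e-6)
```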
点击查看答案
```python
class LoRALinear(nn.Module):
def __init__(self, original_linear, rank=4, alpha=0.1):
super().__init__()
self.alpha = alpha
self.dim_in = original_linear.in_features
self.dim_out = original_linear.out_features
self.r = rank
self.merged = False
# 冻结原始权重
self.weight = nn.Parameter(
original_linear.weight.data.clone(), requires_grad=False
)
if original_linear.bias is not None:
self.bias = nn.Parameter(
original_linear.bias.data.clone(), requires_grad=False
)
else:
self.register_parameter('bias', None)
# LoRA 低秩矩阵
self.WA = nn.Linear(self.dim_in, self.r, bias=False)
self.WB = nn.Linear(self.r, self.dim_out, bias=False)
# 初始化: A用Kaiming, B用零
nn.init.kaiming_uniform_(self.WA.weight)
nn.init.zeros_(self.WB.weight)
def forward(self, X):
if self.merged:
return F.linear(X, self.weight, self.bias)
else:
h = F.linear(X, self.weight, self.bias)
h_lora = h + self.alpha * self.WB(self.WA(X))
return h_lora
def merge(self):
if not self.merged:
delta_W = self.WB.weight @ self.WA.weight
self.weight.data += self.alpha * delta_W
self.merged = True
def unmerge(self):
if self.merged:
delta_W = self.WB.weight @ self.WA.weight
self.weight.data -= self.alpha * delta_W
            self.merged = False
```

解析:
完整的 LoRALinear 需要注意以下关键点:
- 权重冻结:原始 weight 设置 requires_grad=False,只训练 WA 和 WB。
- 初始化策略:WB 初始化为零确保初始时 $\Delta W = 0$,这是 LoRA 能够即插即用的关键。
- merge/unmerge 的对称性:merge 加上 $\alpha \, W_B W_A$,unmerge 减去相同的量。
- 参数量对比:原始 Linear 参数量为 $d_{in} \times d_{out}$,LoRA 只需 $r \times (d_{in} + d_{out})$。当 $d_{in} = d_{out} = 512$、$r = 4$ 时,参数量减少约 64 倍。
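参数量的对比可以直接算出来(以 $d_{in} = d_{out} = 512$、$r = 4$ 为例):

```python
d_in, d_out, r = 512, 512, 4
full = d_in * d_out          # 262144: 原始 Linear 的权重参数量
lora = r * (d_in + d_out)    # 4096: WA + WB 的参数量
print(full // lora)          # 64,即减少约 64 倍
```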
练习 4:给预训练模型注入 LoRA(Level 3)
实现 inject_lora 函数,遍历模型的所有层,将指定名称的 nn.Linear 层替换为 LoRALinear。这是实际使用 LoRA 微调时的核心步骤。
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
# 假设已有完整的 LoRALinear(来自练习3)
class Attention(nn.Module):
"""简化的注意力模块"""
def __init__(self, dim_in, dim_out):
super().__init__()
self.Wq = nn.Linear(dim_in, dim_out, bias=False)
self.Wk = nn.Linear(dim_in, dim_out, bias=False)
self.Wv = nn.Linear(dim_in, dim_out, bias=False)
self.Wo = nn.Linear(dim_out, dim_out, bias=False)
def forward(self, X):
Q, K, V = self.Wq(X), self.Wk(X), self.Wv(X)
S = Q @ K.transpose(1, 2)
P = F.softmax(S, dim=-1)
Z = P @ V
O = self.Wo(Z)
return O
class TransformerBlock(nn.Module):
"""简化的 Transformer 块"""
def __init__(self, dim):
super().__init__()
self.attn = Attention(dim, dim)
self.ffn = nn.Sequential(
nn.Linear(dim, dim * 4),
nn.GELU(),
nn.Linear(dim * 4, dim),
)
def forward(self, X):
X = X + self.attn(X)
X = X + self.ffn(X)
return X
def inject_lora(model, target_modules, rank=4, alpha=0.1):
"""
遍历模型,将 target_modules 中指定名称的 nn.Linear 替换为 LoRALinear。
参数:
model: 要注入的模型
target_modules: list[str],要替换的模块名称(如 ["Wq", "Wk", "Wv"])
rank: LoRA 秩
alpha: 缩放系数
返回:
修改后的 model
"""
# TODO: 递归遍历模型,替换匹配的 Linear 层
for name, module in model.named_children():
if isinstance(module, nn.Linear) and _____:
# 替换为 LoRALinear
_____
_____
print(f"Replaced {name}")
else:
# TODO: 递归处理子模块
_____
return model
# ====== 测试代码 ======
torch.manual_seed(42)
dim = 128
model = TransformerBlock(dim)
x = torch.randn(1, 10, dim)
# 注入前的参数量
total_before = sum(p.numel() for p in model.parameters())
# 只对注意力的 Q/K/V 投影注入 LoRA
inject_lora(model, target_modules=["Wq", "Wk", "Wv"], rank=4, alpha=0.1)
# 注入后的参数量
total_after = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"注入前总参数: {total_before:,}")
print(f"注入后总参数: {total_after:,}")
print(f"可训练参数: {trainable:,}")
print(f"可训练比例: {trainable/total_after*100:.2f}%")
# 验证前向传播正常
y = model(x)
print(f"输出形状: {y.shape}") # 应为 [1, 10, 128]提示
- 判断条件:name in target_modules
- 替换用 setattr(model, name, new_layer)
- 递归调用:对非 Linear 子模块递归调用 inject_lora
- 类似课程代码中的 apply_lora_adapter 函数
点击查看答案
```python
def inject_lora(model, target_modules, rank=4, alpha=0.1):
for name, module in model.named_children():
if isinstance(module, nn.Linear) and name in target_modules:
new_layer = LoRALinear(module, rank=rank, alpha=alpha)
setattr(model, name, new_layer)
print(f"Replaced {name}")
else:
inject_lora(module, target_modules, rank, alpha)
    return model
```

解析:
- 递归遍历:model.named_children() 只返回直接子模块。对于嵌套结构(如 TransformerBlock 内的 Attention),必须递归处理。
- 名称匹配:name in target_modules 允许精确控制哪些层被替换。实际使用中常见的 target 包括 ["q_proj", "k_proj", "v_proj", "o_proj"]。
- setattr 替换:直接修改父模块的属性,将原始 Linear 替换为 LoRALinear。PyTorch 会自动注册新的子模块。
- 参数效率:对于 dim=128、rank=4 的设置,每个 LoRA 层只增加 $4 \times (128 + 128) = 1024$ 个参数,远小于原始的 $128 \times 128 = 16384$ 个。
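注入 LoRA 后,常见的配套操作是只保存旁路参数(底座权重冻结且与预训练模型一致,无需重复保存)。下面是按参数名过滤的示意写法,依赖本文实现中 WA/WB 的子模块命名,并非通用 API:

```python
def lora_state_dict(model):
    """只导出 LoRA 旁路参数(按本文实现中的 WA/WB 子模块名过滤,示意写法)。"""
    return {k: v for k, v in model.state_dict().items()
            if ".WA." in k or ".WB." in k}

sd = lora_state_dict(model)          # 例如 {'attn.Wq.WA.weight': ..., ...}
torch.save(sd, "lora_only.pt")
# 恢复时用 strict=False,只覆盖 LoRA 参数:
# model.load_state_dict(torch.load("lora_only.pt"), strict=False)
```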
与 HuggingFace PEFT 的对比:
```python
# HuggingFace PEFT 的等价操作
from peft import LoraConfig, get_peft_model
config = LoraConfig(
r=16,
lora_alpha=8,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM"
)
model = get_peft_model(model, config)
```
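PEFT 封装后可以用 print_trainable_parameters 查看可训练参数占比,作用与练习 4 测试代码里手写的统计相同:

```python
model.print_trainable_parameters()
# 输出形如: trainable params: ... || all params: ... || trainable%: ...
```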
MLM 代码训练模式
完成上面的固定填空后,可以试试随机挖空模式:每次点击「刷新」会随机遮盖不同的代码片段,帮你彻底记住每一行。
LoRALinear 完整实现
```python
class LoRALinear(nn.Module):
def __init__(self, original_linear, rank=4, alpha=0.1):
super().__init__()
self.alpha = alpha
self.dim_in = original_linear.in_features
self.dim_out = original_linear.out_features
self.r = rank
self.merged = False
self.weight = nn.Parameter(
original_linear.weight.data.clone(), requires_grad=False
)
if original_linear.bias is not None:
self.bias = nn.Parameter(
original_linear.bias.data.clone(), requires_grad=False
)
else:
self.register_parameter('bias', None)
self.WA = nn.Linear(self.dim_in, self.r, bias=False)
self.WB = nn.Linear(self.r, self.dim_out, bias=False)
nn.init.kaiming_uniform_(self.WA.weight)
nn.init.zeros_(self.WB.weight)
def forward(self, X):
if self.merged:
return F.linear(X, self.weight, self.bias)
h = F.linear(X, self.weight, self.bias)
h_lora = h + self.alpha * self.WB(self.WA(X))
        return h_lora
```

权重合并与拆分
```python
def merge(self):
if not self.merged:
delta_W = self.WB.weight @ self.WA.weight
self.weight.data += self.alpha * delta_W
self.merged = True
def unmerge(self):
if self.merged:
delta_W = self.WB.weight @ self.WA.weight
self.weight.data -= self.alpha * delta_W
        self.merged = False
```

模型注入 LoRA
```python
def inject_lora(model, target_modules, rank=4, alpha=0.1):
for name, module in model.named_children():
if isinstance(module, nn.Linear) and name in target_modules:
new_layer = LoRALinear(module, rank=rank, alpha=alpha)
setattr(model, name, new_layer)
else:
inject_lora(module, target_modules, rank, alpha)
    return model
```