
How to perform weight regularization in pytorch?

I implemented a model in pytorch that applies a final fully connected layer before running the softmax function. The architecture is meant to solve a 4-class speech emotion recognition task: given an audio track, it converts it into its spectrogram and uses it to predict one of the emotions happy, sad, neutral and angry.

Unlike the architecture described in that paper, it tries to adapt the implementation of the Compact Convolutional Transformer found on GitHub at https://github.com/SHI-Labs/Compact-Transformers/blob/main/src/cct.py

To improve the performance of the model I followed some of the tricks described in the paper https://arxiv.org/abs/2104.07288. However, just as described in that paper, my model also suffers from the "class collapse" problem: even with a balanced dataset, it tends to predict the angry and sad classes well, while the other two classes are predicted poorly.

In the paper that addresses this problem, they apply a special weight regularization technique to the fully connected layer, as described in section 2.4.

Unfortunately, however, I do not understand how I should modify the fully connected layer in pytorch to implement this type of regularization.

The code of the model:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Module, ModuleList, Linear, Dropout, LayerNorm, Parameter, init

# Note: TransformerEncoderLayer used below is the custom encoder block from the
# Compact-Transformers repository (it takes attention_dropout and drop_path_rate),
# not torch.nn.TransformerEncoderLayer.


class CCT(nn.Module):
    def __init__(self,
                 img_size=224,
                 embedding_dim=768,
                 n_input_channels=3,
                 n_conv_layers=1,
                 kernel_size=7,
                 stride=2,
                 padding=3,
                 pooling_kernel_size=3,
                 pooling_stride=2,
                 pooling_padding=1,
                 dropout=0.,
                 attention_dropout=0.1,
                 stochastic_depth=0.1,
                 num_layers=14,
                 num_heads=6,
                 mlp_ratio=4.0,
                 num_classes=1000,
                 positional_embedding='learnable',
                 *args, **kwargs):
        super(CCT, self).__init__()

        self.tokenizer = Tokenizer(n_input_channels=n_input_channels,
                                   n_output_channels=embedding_dim,
                                   kernel_size=kernel_size,
                                   stride=stride,
                                   padding=padding,
                                   pooling_kernel_size=pooling_kernel_size,
                                   pooling_stride=pooling_stride,
                                   pooling_padding=pooling_padding,
                                   max_pool=True,
                                   activation=nn.ReLU,
                                   n_conv_layers=n_conv_layers,
                                   conv_bias=False)

        self.classifier = TransformerClassifier(
            sequence_length=self.tokenizer.sequence_length(n_channels=n_input_channels,
                                                           height=img_size,
                                                           width=img_size),
            embedding_dim=embedding_dim,
            seq_pool=True,
            dropout=dropout,
            attention_dropout=attention_dropout,
            stochastic_depth=stochastic_depth,
            num_layers=num_layers,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            num_classes=num_classes,
            positional_embedding=positional_embedding
        )

    def forward(self, x):
        x = self.tokenizer(x)
        return self.classifier(x)


class Tokenizer(nn.Module):
    def __init__(self,
                 kernel_size, stride, padding,
                 pooling_kernel_size=3, pooling_stride=2, pooling_padding=1,
                 n_conv_layers=1,
                 n_input_channels=3,
                 n_output_channels=64,
                 in_planes=64,
                 activation=None,
                 max_pool=True,
                 conv_bias=False):
        super(Tokenizer, self).__init__()

        n_filter_list = [n_input_channels] + \
                        [in_planes for _ in range(n_conv_layers - 1)] + \
                        [n_output_channels]

        self.conv_layers = nn.Sequential(
            *[nn.Sequential(
                nn.Conv2d(n_filter_list[i], n_filter_list[i + 1],
                          kernel_size=(kernel_size, kernel_size),
                          stride=(stride, stride),
                          padding=(padding, padding), bias=conv_bias),
                nn.Identity() if activation is None else activation(),
                nn.MaxPool2d(kernel_size=pooling_kernel_size,
                             stride=pooling_stride,
                             padding=pooling_padding) if max_pool else nn.Identity()
            )
                for i in range(n_conv_layers)
            ])

        self.flattener = nn.Flatten(2, 3)
        self.apply(self.init_weight)

    def sequence_length(self, n_channels=3, height=224, width=224):
        return self.forward(torch.zeros((1, n_channels, height, width))).shape[1]

    def forward(self, x):
        return self.flattener(self.conv_layers(x)).transpose(-2, -1)

    @staticmethod
    def init_weight(m):
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight)


class TransformerClassifier(Module):
    def __init__(self,
                 seq_pool=True,
                 embedding_dim=768,
                 num_layers=12,
                 num_heads=12,
                 mlp_ratio=4.0,
                 num_classes=1000,
                 dropout=0.1,
                 attention_dropout=0.1,
                 stochastic_depth=0.1,
                 positional_embedding='learnable',
                 sequence_length=None):
        super().__init__()
        positional_embedding = positional_embedding if \
            positional_embedding in ['sine', 'learnable', 'none'] else 'sine'
        dim_feedforward = int(embedding_dim * mlp_ratio)
        self.embedding_dim = embedding_dim
        self.sequence_length = sequence_length
        self.seq_pool = seq_pool
        self.num_tokens = 0

        assert sequence_length is not None or positional_embedding == 'none', \
            f"Positional embedding is set to {positional_embedding} and" \
            f" the sequence length was not specified."

        if not seq_pool:
            sequence_length += 1
            self.class_emb = Parameter(torch.zeros(1, 1, self.embedding_dim),
                                       requires_grad=True)
            self.num_tokens = 1
        else:
            self.attention_pool = Linear(self.embedding_dim, 1)

        if positional_embedding != 'none':
            if positional_embedding == 'learnable':
                self.positional_emb = Parameter(torch.zeros(1, sequence_length, embedding_dim),
                                                requires_grad=True)
                init.normal_(self.positional_emb, std=0.2)
            else:
                self.positional_emb = Parameter(self.sinusoidal_embedding(sequence_length, embedding_dim),
                                                requires_grad=False)
        else:
            self.positional_emb = None

        self.dropout = Dropout(p=dropout)
        dpr = [x.item() for x in torch.linspace(0, stochastic_depth, num_layers)]
        self.blocks = ModuleList([
            TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads,
                                    dim_feedforward=dim_feedforward, dropout=dropout,
                                    attention_dropout=attention_dropout, drop_path_rate=dpr[i])
            for i in range(num_layers)])
        self.norm = LayerNorm(embedding_dim)

        # The final fully connected layer that the question is about.
        self.fc = Linear(embedding_dim, num_classes)
        self.apply(self.init_weight)

    def forward(self, x):
        if self.positional_emb is None and x.size(1) < self.sequence_length:
            x = F.pad(x, (0, 0, 0, self.n_channels - x.size(1)), mode='constant', value=0)

        if not self.seq_pool:
            cls_token = self.class_emb.expand(x.shape[0], -1, -1)
            x = torch.cat((cls_token, x), dim=1)

        if self.positional_emb is not None:
            x += self.positional_emb

        x = self.dropout(x)

        for blk in self.blocks:
            x = blk(x)
        x = self.norm(x)

        if self.seq_pool:
            x = torch.matmul(F.softmax(self.attention_pool(x), dim=1).transpose(-1, -2), x).squeeze(-2)
        else:
            x = x[:, 0]

        x = self.fc(x)
        return x

Can anyone help me?

Since you did not share any network architecture, I will try to give a basic example. I am not sure about the regularization used in the paper, but here is a simple example that applies L1 regularization to a specific layer (for example layer 0):

import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Sequential(
    nn.Linear(5, 5),
    nn.ReLU(),
    nn.Linear(5, 2)
)

x = torch.randn(5, 5)
target = torch.ones(5, dtype=torch.long)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-2)

for epoch in range(10):
    optimizer.zero_grad()
    output = model(x)
    loss = criterion(output, target)
    # This is your weight regularization: pick the layer you want, e.g. model[0],
    # and apply whatever norm you want to that layer's weights.
    l1_norm = torch.norm(model[0].weight, p=1)
    loss += l1_norm
    loss.backward()
    optimizer.step()

    print('Epoch {}, loss {}, norm layer {}'.format(
        epoch, loss.item(), l1_norm.item()))
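Two follow-up notes on the example above (these are generic additions for illustration, not the exact technique from section 2.4 of the paper). First, the penalty is usually scaled by a coefficient so its strength can be tuned. Second, in the model from the question the final fully connected layer is self.fc inside TransformerClassifier, so it is reachable as model.classifier.fc. A minimal sketch along those lines, assuming the full CCT code from the repo is importable; the names net and reg_lambda and the dummy spectrogram batch are made up for illustration:

import torch
import torch.nn as nn
import torch.optim as optim

# Assumes CCT (and the repo code it depends on) is importable.
# 1-channel spectrograms, 4 emotion classes; the batch is random dummy data.
net = CCT(img_size=224, n_input_channels=1, num_classes=4)

spectrograms = torch.randn(2, 1, 224, 224)
labels = torch.randint(0, 4, (2,))

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=1e-4)
reg_lambda = 1e-4  # tunable strength of the penalty

optimizer.zero_grad()
loss = criterion(net(spectrograms), labels)
# Penalize only the final fully connected layer of the classifier
# (squared Frobenius norm of its weight matrix, i.e. a plain L2 penalty).
fc_penalty = torch.norm(net.classifier.fc.weight, p=2) ** 2
loss = loss + reg_lambda * fc_penalty
loss.backward()
optimizer.step()

If a plain L2 penalty on that one layer is all you need, you can get a very similar effect without the extra loss term by putting net.classifier.fc.parameters() in its own optimizer parameter group with a non-zero weight_decay and leaving weight_decay at 0 for the remaining parameters.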
