Kaggle首战记录(3)-English Language Learning-baseline的设计和训练
基于上述原因的权衡,baseline采取roberta-base + 两层全连接层的模式。采用roberta最后一层隐藏层的第一个向量(也就是CLS的embedding)作为每个子句的表示,同一个样本的各个子句向量先做max pooling合并成一个768维向量,再经过全连接层——batchnorm层——relu层——全连接层——sigmoid并乘以6映射到(0, 6)作为输出。损失函数采用MSE。
batchnorm是代替dropout的正则化方法,但用在此处是有疑问的。经过测试,roberta大概占1700MB显存,使用AdamW优化器的情况下,结合上一篇文章的数据处理方法(一个句子最多5个子句,说明一个batch的向量数最多是batchsize * 5),显存刚好能支撑batchsize = 4的情况(这也是不选deberta的原因——参数量大、训练慢,而且batchsize只能更小)。但是在这么小的批量下,batchnorm估计出的均值和方差噪声很大,效果肯定会比较差。
# Notebook setup cell: load the training CSV, seed every RNG in use and load
# the pretrained roberta-base tokenizer/encoder onto the selected device.
import random

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch import nn
from transformers import RobertaModel, RobertaTokenizer

path = '../input/feedback-prize-english-language-learning/train.csv'
data = pd.read_csv(path)
# Vectorised strip of leading/trailing whitespace; unlike a per-row lambda it
# does not raise on missing values.
data['full_text'] = data['full_text'].str.strip()


def init_seeds(seed=7, deterministic=None):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Args:
        seed: Integer seed applied to every generator.
        deterministic: When True, force cuDNN into deterministic mode and
            disable its benchmark autotuner. When None (the default) the
            legacy rule is kept: the cuDNN flags are only set when
            ``seed == 0``. When False the cuDNN flags are left untouched.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    if deterministic is None:
        # Backward-compatible default: only seed 0 switched cuDNN to
        # deterministic mode in the original implementation.
        deterministic = seed == 0
    if deterministic:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False


device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): with seed 42 the cuDNN flags are NOT changed; pass
# deterministic=True here if run-to-run reproducibility on GPU is required.
init_seeds(42)

tokenizer = RobertaTokenizer.from_pretrained('../input/roberta-base')
model = RobertaModel.from_pretrained('../input/roberta-base').to(device)
class Class_Pool_Net(nn.Module):
    """Regression head on top of a pretrained encoder.

    The first-position vector of the encoder's ``last_hidden_state`` is taken
    for every input row, the rows are regrouped into ``batch_size`` samples,
    max-pooled across the rows of each sample, and passed through
    Linear(768, 256) -> BatchNorm1d -> ReLU -> Linear(256, 6) -> sigmoid * 6.

    Args:
        batch_size: Number of samples per forward call; used to regroup the
            encoder rows, so it must match the batch actually fed in.
        pretrained_model: Encoder whose output is indexable with
            ``'last_hidden_state'`` (assumed hidden size 768).
        device: Device the head layers are moved to.
    """

    def __init__(self, batch_size, pretrained_model, device):
        super().__init__()
        self.batch_size = batch_size
        self.model = pretrained_model
        # Attribute names are part of the checkpoint format (state_dict keys).
        self.linear1 = nn.Linear(768, 256).to(device)
        self.batchnorm = nn.BatchNorm1d(256).to(device)
        self.linear2 = nn.Linear(256, 6).to(device)

    def forward(self, x):
        """Return a ``(batch_size, 6)`` tensor of scores scaled into (0, 6).

        ``x`` is handed to the encoder unchanged; presumably it holds token
        ids with ``batch_size * k`` rows (k rows per sample) -- TODO confirm
        against the collate function.
        """
        first_token = self.model(x)['last_hidden_state'][:, 0]
        per_sample = first_token.reshape([self.batch_size, -1, 768])
        # Pooling to (1, 768) keeps the feature axis and takes the maximum
        # over the rows that belong to the same sample.
        pooled = F.adaptive_max_pool2d(
            input=per_sample, output_size=(1, 768)
        ).squeeze(1)
        hidden = F.relu(self.batchnorm(self.linear1(pooled)))
        return torch.sigmoid(self.linear2(hidden)) * 6

    def change_batch_size(self, size):
        """Set the batch size used to regroup encoder rows in ``forward``."""
        self.batch_size = size
# evaluate_function no longer needs scikit-learn; the import is kept on a
# best-effort basis only so that other cells referencing mean_squared_error
# keep working when scikit-learn is installed.
try:
    from sklearn.metrics import mean_squared_error
except ImportError:
    mean_squared_error = None


def evaluate_function(y_preds, y_trues):
    """Return the mean column-wise RMSE (MCRMSE) of predictions vs. targets.

    The RMSE is computed independently for every target column and the
    column scores are averaged. The computation is done with NumPy because
    ``mean_squared_error(..., squared=False)`` is deprecated and rejected by
    recent scikit-learn releases.

    Args:
        y_preds: 2-D torch tensor of predictions, one column per target.
        y_trues: 2-D torch tensor of ground-truth values, same shape.

    Returns:
        The MCRMSE as a NumPy float.

    Raises:
        ValueError: If the two tensors do not have the same 2-D shape.
    """
    preds = y_preds.detach().cpu().numpy().astype(np.float64)
    trues = y_trues.detach().cpu().numpy().astype(np.float64)
    if preds.ndim != 2 or preds.shape != trues.shape:
        # Fail loudly instead of letting NumPy broadcasting hide a mismatch.
        raise ValueError(
            f'expected two 2-D tensors of identical shape, got '
            f'{preds.shape} and {trues.shape}'
        )
    column_rmse = np.sqrt(np.mean((trues - preds) ** 2, axis=0))
    mcrmse_score = np.mean(column_rmse)
    return mcrmse_score
def get_group_parameters(model):
    """Split the model's parameters into four optimizer parameter groups.

    A parameter is classified by substring matches on its name:

    * "head" if the name contains ``linear1`` or ``linear2``;
    * "no decay" if the name contains ``bias``, ``LayerNorm`` or
      ``batchnorm``.

    The returned groups are, in order:

    1. non-head, decayed      -- weight_decay 1e-2, lr 1e-5
    2. non-head, no decay     -- weight_decay 0,    lr 1e-5
    3. head, no decay         -- weight_decay 0,    lr 1e-2
    4. head, decayed          -- weight_decay 1e-2, lr 1e-2

    Note that ``batchnorm`` parameters are not matched by the head list, so
    they land in group 2 and train with the 1e-5 learning rate.

    Args:
        model: Object exposing ``named_parameters()``.

    Returns:
        A list of four dicts with keys ``params``, ``weight_decay``, ``lr``.
    """
    no_decay_keys = ('bias', 'LayerNorm', 'batchnorm')
    head_keys = ('linear1', 'linear2')

    body_decay, body_plain, head_plain, head_decay = [], [], [], []
    for name, tensor in list(model.named_parameters()):
        is_head = any(key in name for key in head_keys)
        skip_decay = any(key in name for key in no_decay_keys)
        if is_head:
            bucket = head_plain if skip_decay else head_decay
        else:
            bucket = body_plain if skip_decay else body_decay
        bucket.append(tensor)

    param_group = [
        {'params': body_decay, 'weight_decay': 1e-2, 'lr': 1e-5},
        {'params': body_plain, 'weight_decay': 0, 'lr': 1e-5},
        {'params': head_plain, 'weight_decay': 0, 'lr': 1e-2},
        {'params': head_decay, 'weight_decay': 1e-2, 'lr': 1e-2},
    ]
    return param_group
# Training script: fine-tunes Class_Pool_Net with gradient accumulation,
# validates periodically inside each epoch and keeps three checkpoints
# (best validation score, lowest epoch training loss, latest epoch).

# --- configuration ---------------------------------------------------------
batch_size = 4
epoch_num = 10
lr = 1e-5
loss = nn.MSELoss()
accumulate_steps = 80  # batches whose gradients are summed per optimizer step

# --- data ------------------------------------------------------------------
train_dataset = writing_dataset()
val_dataset = writing_dataset(data=data, typ='val')
# drop_last=True: the network regroups encoder rows with a fixed batch size,
# so a smaller trailing batch must not reach it.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size,
                          collate_fn=collate, drop_last=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=1, collate_fn=collate)

# --- model / optimisation --------------------------------------------------
net = Class_Pool_Net(batch_size, model, device)
param = get_group_parameters(net)
# Every parameter group carries its own lr, so lr=lr here is only the
# optimizer-level default.
optimizer = torch.optim.AdamW(param, lr=lr)
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer, milestones=[2, 4, 6, 8], gamma=0.4)

lo = 0
min_lo = 2e5
min_mse = 200
num_batches = len(train_loader)

for epoch in range(epoch_num):
    lo = 0
    net.train()
    net.batch_size = batch_size
    for i, (X, y) in enumerate(train_loader):
        X = X.to(device)
        y = y.to(device)
        y_hat = net(X)
        # Scale so the accumulated gradient averages over the window.
        l = loss(y_hat, y) / accumulate_steps
        lo += l.item()
        l.backward()

        is_last_batch = (i + 1) == num_batches
        if (i + 1) % accumulate_steps == 0 or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

        # Validate every 8 optimizer steps and once more at the end of the
        # epoch.
        if (i + 1) % (accumulate_steps * 8) == 0 or is_last_batch:
            print(f'the {epoch} th: {100 * (i + 1) / num_batches} %')
            net.eval()
            net.batch_size = 1  # the validation loader yields one sample
            val_preds = []
            val_trues = []
            # Dedicated loop names: the original reused `i`, X and y here and
            # clobbered the training-loop variables. Outputs are gathered in
            # lists and concatenated once instead of re-copying per step.
            with torch.no_grad():
                for X_val, y_val in val_loader:
                    val_preds.append(net(X_val.to(device)))
                    val_trues.append(y_val.to(device))
            ypred = torch.cat(val_preds)
            ytrue = torch.cat(val_trues)
            mse = evaluate_function(ypred, ytrue)
            print(f'mse: {mse} .')
            if mse < min_mse:
                torch.save({'model': net.state_dict()}, './minmse_fold.pth')
                min_mse = mse
            net.train()
            net.batch_size = batch_size

    scheduler.step()
    if lo < min_lo:
        torch.save({'model': net.state_dict()}, './minloss_fold.pth')
        min_lo = lo
    print(f'{epoch} th epoch: last loss: {lo * accumulate_steps} .')
    # The best-score checkpoint is already handled right after each
    # validation pass (which always runs on the last batch), so the former
    # duplicate `mse < min_mse` check at this point could never fire and has
    # been dropped.
    torch.save({'model': net.state_dict()}, './last_fold.pth')

print('End.')