1. Imports and EDA
import os
from pathlib import Path

# detect whether we are running inside a Kaggle kernel
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('/kaggle/input/us-patent-phrase-to-phrase-matching')
import pandas as pd

df = pd.read_csv(path/'train.csv')
# combine the context, target, and anchor phrases into a single input string
df['input'] = 'text1: ' + df.context + '; text2: ' + df.target + '; anc1: ' + df.anchor
df.input.head()
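Before tokenizing, it helps to look at the data itself. A minimal EDA sketch (an illustrative addition; the column names id, anchor, target, context, and score come from the competition's train.csv):

# quick exploratory checks on the training data
print(df.shape)                       # number of rows and columns
print(df.describe(include='object'))  # counts and unique values for the text columns
print(df.score.value_counts())        # scores take the values 0, 0.25, 0.5, 0.75, 1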
2. Tokenization
from datasets import Dataset, DatasetDict

ds = Dataset.from_pandas(df)

import warnings, logging, torch
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)

model_nm = 'anferico/bert-for-patents'

# load the model and tokenizer directly from the Hugging Face Hub;
# num_labels=1 gives a single regression output for the similarity score
from transformers import AutoModelForSequenceClassification, AutoTokenizer
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
tokenizer = AutoTokenizer.from_pretrained(model_nm)
def tok_func(x): return tokenizer(x['input'])

# tokenize all the sentences using the tokenizer
tok_ds = ds.map(tok_func, batched=True)
# Trainer expects the target column to be named 'labels'
tok_ds = tok_ds.rename_columns({'score': 'labels'})
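To see what the tokenizer produced, inspect one row of tok_ds; a short sketch (illustrative, not in the original listing) of the fields a BERT tokenizer adds:

row = tok_ds[0]
print(row.keys())             # now includes 'input_ids', 'token_type_ids', 'attention_mask'
print(row['input_ids'][:10])  # numeric ids of the first few subword tokens
# map the ids back to the subword strings they encode
print(tokenizer.convert_ids_to_tokens(row['input_ids'][:10]))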
3. Test and Validation Sets
eval_df = pd.read_csv(path/'test.csv')

# hold out 25% of the training data as a validation set
dds = tok_ds.train_test_split(0.25, seed=42)

# build the same combined input string for the test set
eval_df['input'] = 'text1: ' + eval_df.context + '; text2: ' + eval_df.target + '; anc1: ' + eval_df.anchor
eval_ds = Dataset.from_pandas(eval_df).map(tok_func, batched=True)
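train_test_split returns a DatasetDict whose 'test' split serves as our validation set here; a quick check (illustrative):

print(dds)  # DatasetDict with 'train' (75%) and 'test' (25%) splits
print(len(dds['train']), len(dds['test']))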
4. Metrics and Correlation
import numpy as np

def corr(x, y):
    # flatten the 2-D prediction array into 1-D before computing Pearson r
    return np.corrcoef(x.flatten(), y)[0, 1]

def corr_d(eval_pred):
    # Trainer passes (predictions, labels); report the Pearson correlation
    return {'pearson': corr(*eval_pred)}
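A quick sanity check of the metric on synthetic arrays (an illustrative addition): perfectly correlated inputs should give a Pearson coefficient of 1.

# dummy data shaped like (predictions, labels): (n, 1) vs (n,)
x = np.array([[0.1], [0.5], [0.9]])
y = np.array([0.1, 0.5, 0.9])
print(corr(x, y))  # -> 1.0 (up to floating point)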
5. Training Our Model
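The prediction step in the next section relies on a fitted trainer object, so the model is fine-tuned here with Hugging Face's Trainer API. A minimal sketch follows; the batch size, learning rate, epoch count, and other hyperparameters are illustrative assumptions, not values taken from this article.

from transformers import TrainingArguments, Trainer

bs = 128      # assumed batch size
epochs = 4    # assumed number of epochs
lr = 8e-5     # assumed learning rate

args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1,
    lr_scheduler_type='cosine', fp16=True, evaluation_strategy='epoch',
    per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to='none')

trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokenizer, compute_metrics=corr_d)
trainer.train()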

6. Getting Predictions on the Test Set
preds = trainer.predict(eval_ds).predictions.astype(float)
# clip predictions into the valid score range [0, 1]
preds = np.clip(preds, 0, 1)

import datasets

submission = datasets.Dataset.from_dict({
    'id': eval_ds['id'],
    'score': preds
})
submission.to_csv('submission.csv', index=False)
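Before uploading, it is worth reading the file back to confirm it has the expected columns (an illustrative check):

check = pd.read_csv('submission.csv')
print(check.head())  # expect columns: id, score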
That concludes this walkthrough of getting started with NLP on Kaggle using BERT.