utils.py

# coding: UTF-8
import os
import torch
import numpy as np
import pickle as pkl
from tqdm import tqdm
import time
from datetime import timedelta

MAX_VOCAB_SIZE = 100000  # upper bound on the vocabulary size


def build_vocab(file_path, tokenizer, max_size, min_freq):
    """Build a token -> id vocabulary from a tab-separated training file."""
    UNK, PAD = '<UNK>', '<PAD>'
    vocab_dic = {}
    with open(file_path, 'r', encoding='UTF-8') as f:
        for line in tqdm(f):
            lin = line.strip()
            if not lin:
                continue
            content = lin.split('\t')[0]
            for word in tokenizer(content):
                vocab_dic[word] = vocab_dic.get(word, 0) + 1
    # Keep the max_size most frequent tokens seen at least min_freq times,
    # then append the two special tokens at the end of the id range.
    vocab_list = sorted([_ for _ in vocab_dic.items() if _[1] >= min_freq],
                        key=lambda x: x[1], reverse=True)[:max_size]
    vocab_dic = {word_count[0]: idx for idx, word_count in enumerate(vocab_list)}
    vocab_dic.update({UNK: len(vocab_dic), PAD: len(vocab_dic) + 1})
    return vocab_dic
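
# Illustrative only: with a char-level tokenizer, build_vocab returns a mapping
# along the lines of {'的': 0, '了': 1, ..., '<UNK>': n, '<PAD>': n + 1}, where
# ids follow descending token frequency and the two special tokens come last
# (the example characters are hypothetical; actual contents depend on the data).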


def build_dataset(config, use_word):
    """Load the vocabulary and the train/dev/test splits as (ids, label, seq_len) tuples."""
    UNK, PAD = '<UNK>', '<PAD>'
    if use_word:
        tokenizer = lambda x: x.split(' ')  # word-level: tokens are separated by spaces
    else:
        tokenizer = lambda x: [y for y in x]  # char-level
    if os.path.exists(config.vocab_path):
        vocab = pkl.load(open(config.vocab_path, 'rb'))
    else:
        vocab = build_vocab(config.train_path, tokenizer=tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
        pkl.dump(vocab, open(config.vocab_path, 'wb'))
    print(f"Vocab size: {len(vocab)}")

    def load_dataset(path, pad_size=300):
        contents = []
        with open(path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):
                lin = line.strip()
                if not lin:
                    continue
                content, label = lin.split('\t')
                words_line = []
                token = tokenizer(content)
                seq_len = len(token)
                if pad_size:
                    if len(token) < pad_size:
                        token.extend([PAD] * (pad_size - len(token)))
                    else:
                        token = token[:pad_size]
                        seq_len = pad_size
                # map tokens to ids, falling back to <UNK>
                for word in token:
                    words_line.append(vocab.get(word, vocab.get(UNK)))
                contents.append((words_line, int(label), seq_len))
        return contents  # [([...], 0, seq_len), ([...], 1, seq_len), ...]

    train = load_dataset(config.train_path, config.pad_size)
    dev = load_dataset(config.dev_path, config.pad_size)
    test = load_dataset(config.test_path, config.pad_size)
    return vocab, train, dev, test


class DatasetIterater(object):
    def __init__(self, batches, batch_size, device, model_name):
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size
        self.residue = False  # True if the last batch is smaller than batch_size
        if len(batches) % batch_size != 0:
            self.residue = True
        self.index = 0
        self.device = device
        self.model_name = model_name

    def _to_tensor(self, datas):
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)
        # sequence length before padding (capped at pad_size)
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        if self.model_name == 'Bert':
            # Bert-style samples are expected to carry an attention mask as the 4th field.
            mask = torch.LongTensor([_[3] for _ in datas]).to(self.device)
            return (x, seq_len, mask), y
        return (x, seq_len), y

    def __next__(self):
        if self.residue and self.index == self.n_batches:
            batches = self.batches[self.index * self.batch_size: len(self.batches)]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches
        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

    def __iter__(self):
        return self

    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches
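
# Note (inferred from usage in this file): build_iterator only relies on
# config.batch_size, config.device and config.model_name, while build_dataset
# above additionally reads config.vocab_path, config.train_path, config.dev_path,
# config.test_path and config.pad_size; any config object exposing these
# attributes will work here.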


def build_iterator(dataset, config):
    return DatasetIterater(dataset, config.batch_size, config.device, config.model_name)


def get_vocab():
    # vocab.pkl is a pickled {token: id} dict written by build_vocab, not a CSV.
    with open('../data/vocab.pkl', 'rb') as f:
        word_to_id = pkl.load(f)
    return list(word_to_id.keys()), word_to_id


def get_time_dif(start_time):
    """Return the elapsed wall-clock time since start_time."""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))


if __name__ == "__main__":
    '''Extract pretrained word vectors (the current code initializes a random embedding table).'''
    data_path = '../data'
    train_dir = data_path + "/train.txt"
    vocab_dir = data_path + "/vocab.pkl"
    # pretrain_dir = "./data/word_embedding/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5"
    emb_dim = 300
    filename_trimmed_dir = "../data/word_embedding/embedding_table"
    if os.path.exists(vocab_dir):
        word_to_id = pkl.load(open(vocab_dir, 'rb'))
    else:
        # tokenizer = lambda x: x.split(' ')  # build the vocabulary at word level (tokens separated by spaces)
        tokenizer = lambda x: [y for y in x]  # build the vocabulary at character level
        word_to_id = build_vocab(train_dir, tokenizer=tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
        pkl.dump(word_to_id, open(vocab_dir, 'wb'))
    print(word_to_id)
    # Random initialization; loading pretrained vectors from pretrain_dir is left commented out above.
    embeddings = np.random.rand(len(word_to_id), emb_dim)
    np.savez_compressed(filename_trimmed_dir, embeddings=embeddings)
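
    # Minimal sanity check of DatasetIterater on synthetic data (no data files
    # required). The fake samples mirror the (token_ids, label, seq_len) tuples
    # produced by load_dataset; 'TextCNN' here stands for any non-Bert model name.
    fake_data = [([1, 2, 3] + [0] * 297, i % 2, 3) for i in range(10)]
    sanity_iter = DatasetIterater(fake_data, 4, torch.device('cpu'), 'TextCNN')
    for (x, seq_len), y in sanity_iter:
        # expect two full (4, 300) batches followed by a final (2, 300) residue batch
        print(x.shape, seq_len.shape, y.shape)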