process.py 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859
  1. # coding:utf-8
  2. import pandas as pd
  3. import re
  4. def text_parse(text):
  5. # 正则过滤掉特殊符号,标点,英文,数字等
  6. reg_1 = '[!"#%&\'()*+,-./::;;|=?@,\t—。?★、…【】《》?“”‘’![\\]^_`{|}~]+  '
  7. # 去除空格
  8. reg_2 = '\\s+'
  9. text = re.sub(reg_1, ' ', text)
  10. text = re.sub(reg_2, '', text)
  11. # 去除换行符
  12. text = text.replace('\n', '')
  13. text = re.sub(reg_2, '', text)
  14. return text
  15. def split_data(file_path):
  16. df = pd.read_csv(file_path, encoding='utf-8')
  17. df.text = df.text.map(text_parse)
  18. df['label_id'] = df.label
  19. df = df[['text', 'label_id']]
  20. df = df.drop(df[df['text'].map(len) <2].index)
  21. df = df.drop(df[df['text'].map(len) > 250].index)
  22. df.drop_duplicates(inplace=True)
  23. # 0.7,0.15,0.15比例划分训练集,测试集,验证集
  24. df = df.sample(frac=1.0)
  25. rows, cols = df.shape
  26. split_index_1 = int(rows * 0.15)
  27. split_index_2 = int(rows * 0.3)
  28. # 数据分割
  29. df_test = df.iloc[0:split_index_1, :]
  30. df_dev = df.iloc[split_index_1:split_index_2, :]
  31. df_train = df.iloc[split_index_2: rows, :]
  32. df_test.to_csv('./data/test.txt', sep="\t", index=False, header=None, encoding='utf-8')
  33. df_train.to_csv('./data/train.txt', sep="\t", index=False, header=None, encoding='utf-8')
  34. df_dev.to_csv('./data/dev.txt', sep="\t", index=False, header=None,encoding='utf-8')
  35. return df_test, df_dev, df_train
  36. def data_show(file):
  37. df = pd.read_csv(file, names=['text', 'label'], sep='\t', encoding='utf-8')
  38. len_list = []
  39. for i in df.text:
  40. len_list.append(len(i))
  41. import matplotlib.pyplot as plt
  42. import numpy as np
  43. len_list.sort()
  44. plt.bar(np.arange(len(len_list)), len_list)
  45. plt.show()
  46. if __name__ == '__main__':
  47. split_data('../data/other_data/target_label3.csv')
  48. # pass