# coding:utf-8
import os
import re

import pandas as pd
def text_parse(text):
    """Strip punctuation, special symbols and all whitespace from *text*.

    Every character in the punctuation class below (ASCII and CJK
    punctuation, tab) is removed, then every remaining whitespace character
    (spaces, newlines, ...) is removed too, so the result is one contiguous
    string.

    Args:
        text: The raw string to clean. ``None`` and float NaN are treated as
            missing and yield an empty string; any other non-string value is
            converted with ``str()`` before cleaning.

    Returns:
        The cleaned string, or ``''`` when the input is missing.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Runs of punctuation / special symbols. The pattern must end right
    # after `+`: a trailing space here would make it match only those runs
    # that happen to be followed by a space.
    punctuation = '[!"#%&\'()*+,-./::;;|=?@,\t—。?★、…【】《》?“”‘’![\\]^_`{|}~]+'
    # Any whitespace, which also covers newlines.
    whitespace = '\\s+'

    text = re.sub(punctuation, '', text)
    return re.sub(whitespace, '', text)
def split_data(file_path, output_dir='./data', random_state=None):
    """Clean a labelled CSV and write shuffled test/dev/train splits.

    The CSV is read with pandas and must provide ``text`` and ``label``
    columns. Each text is cleaned with ``text_parse``; rows whose cleaned
    text is shorter than 2 or longer than 250 characters are discarded, as
    are exact duplicate (text, label) rows. The remaining rows are shuffled
    and cut into 15% test, 15% dev and 70% train. Each split is written to
    ``output_dir`` as a tab-separated, header-less UTF-8 file
    (``test.txt``, ``train.txt``, ``dev.txt``).

    Args:
        file_path: Path of the UTF-8 encoded source CSV.
        output_dir: Directory that receives the three split files. It is
            created when it does not exist. Defaults to ``'./data'``.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. ``None`` keeps the shuffle unseeded.

    Returns:
        A tuple ``(df_test, df_dev, df_train)`` of DataFrames with the
        columns ``text`` and ``label_id``.
    """
    raw = pd.read_csv(file_path, encoding='utf-8')

    # Keep only the cleaned text and the label (renamed to label_id).
    data = pd.DataFrame({
        'text': raw['text'].map(text_parse),
        'label_id': raw['label'],
    })

    # Filter with a boolean mask and a non-inplace drop_duplicates so no
    # derived frame is mutated in place.
    lengths = data['text'].map(len)
    data = data[(lengths >= 2) & (lengths <= 250)].drop_duplicates()

    # Shuffle all rows, then cut 15% test / 15% dev / 70% train.
    shuffled = data.sample(frac=1.0, random_state=random_state)
    rows = len(shuffled)
    test_end = int(rows * 0.15)
    dev_end = int(rows * 0.3)

    df_test = shuffled.iloc[:test_end, :]
    df_dev = shuffled.iloc[test_end:dev_end, :]
    df_train = shuffled.iloc[dev_end:, :]

    # to_csv does not create missing directories, so make sure it exists.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    outputs = (
        ('test.txt', df_test),
        ('train.txt', df_train),
        ('dev.txt', df_dev),
    )
    for file_name, part in outputs:
        part.to_csv(
            os.path.join(output_dir, file_name),
            sep='\t',
            index=False,
            header=False,
            encoding='utf-8',
        )

    return df_test, df_dev, df_train
def data_show(file):
    """Show a bar chart of the sorted text lengths found in *file*.

    Args:
        file: Path of a header-less, tab-separated UTF-8 file; its two
            columns are read as ``text`` and ``label``.
    """
    frame = pd.read_csv(file, names=['text', 'label'], sep='\t', encoding='utf-8')
    lengths = [len(entry) for entry in frame.text]

    import matplotlib.pyplot as plt
    import numpy as np

    # One bar per sample, ordered from shortest to longest text.
    ordered = sorted(lengths)
    positions = np.arange(len(ordered))
    plt.bar(positions, ordered)
    plt.show()
if __name__ == '__main__':
    # Script entry point: run the clean-and-split step on the labelled CSV.
    source_csv = '../data/other_data/target_label3.csv'
    split_data(source_csv)
|