# coding:utf-8
"""Clean a labelled text CSV and split it into train / dev / test TSV files."""
import os
import re

import pandas as pd

# Punctuation and special symbols (ASCII and full-width) stripped from texts.
# NOTE: the pattern must not carry trailing spaces; with them, punctuation is
# only matched when it happens to be followed by exactly those spaces.
_PUNCT_RE = re.compile(
    '[!"#%&\'()*+,-./::;;|=?@,\t—。?★、…【】《》?“”‘’![\\]^_`{|}~]+'
)
# Any run of whitespace, including spaces, tabs and line breaks.
_SPACE_RE = re.compile('\\s+')

# Texts shorter / longer than these bounds (after cleaning) are discarded.
_MIN_TEXT_LEN = 2
_MAX_TEXT_LEN = 250


def text_parse(text):
    """Return *text* with punctuation, special symbols and whitespace removed.

    Letters and digits (Chinese or Latin) are kept untouched.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; may be empty.
    """
    text = _PUNCT_RE.sub('', text)
    # ``\s`` already covers newlines, so one pass removes every blank.
    return _SPACE_RE.sub('', text)


def split_data(file_path, output_dir='./data', test_ratio=0.15,
               dev_ratio=0.15, random_state=None):
    """Clean a CSV of labelled texts and split it into test/dev/train sets.

    The CSV must provide ``text`` and ``label`` columns. Texts are cleaned
    with :func:`text_parse`, filtered by length, de-duplicated, shuffled and
    written as header-less, tab-separated ``test.txt``, ``dev.txt`` and
    ``train.txt`` files inside *output_dir*.

    Args:
        file_path: Path of the UTF-8 encoded source CSV.
        output_dir: Directory receiving the three output files; created when
            missing.
        test_ratio: Fraction of rows assigned to the test set.
        dev_ratio: Fraction of rows assigned to the dev set; the remainder
            becomes the training set.
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the shuffle reproducible.

    Returns:
        A ``(df_test, df_dev, df_train)`` tuple of DataFrames with the
        columns ``text`` and ``label_id``.

    Raises:
        ValueError: If the ratios are negative or sum to more than 1.
    """
    if test_ratio < 0 or dev_ratio < 0 or test_ratio + dev_ratio > 1:
        raise ValueError('test_ratio and dev_ratio must be >= 0 and sum to <= 1')

    raw = pd.read_csv(file_path, encoding='utf-8')
    # Rows without text cannot be cleaned (NaN is a float, not a string).
    raw = raw.dropna(subset=['text'])

    df = pd.DataFrame({
        # astype(str) guards against columns pandas parsed as numbers.
        'text': raw['text'].astype(str).map(text_parse),
        'label_id': raw['label'],
    })

    lengths = df['text'].str.len()
    df = df[(lengths >= _MIN_TEXT_LEN) & (lengths <= _MAX_TEXT_LEN)]
    df = df.drop_duplicates()

    # Shuffle before slicing so each subset is a random sample.
    df = df.sample(frac=1.0, random_state=random_state)
    rows = len(df)
    test_end = int(rows * test_ratio)
    dev_end = int(rows * (test_ratio + dev_ratio))

    df_test = df.iloc[:test_end, :]
    df_dev = df.iloc[test_end:dev_end, :]
    df_train = df.iloc[dev_end:, :]

    os.makedirs(output_dir, exist_ok=True)
    for name, part in (('test', df_test), ('train', df_train), ('dev', df_dev)):
        part.to_csv(os.path.join(output_dir, name + '.txt'), sep='\t',
                    index=False, header=False, encoding='utf-8')
    return df_test, df_dev, df_train


def data_show(file):
    """Plot the sorted text lengths of a tab-separated ``text``/``label`` file.

    Args:
        file: Path of a header-less TSV file as written by :func:`split_data`.
    """
    # Plotting dependencies are imported lazily so that the data-preparation
    # functions stay usable without matplotlib installed.
    import matplotlib.pyplot as plt
    import numpy as np

    df = pd.read_csv(file, names=['text', 'label'], sep='\t',
                     encoding='utf-8', dtype={'text': str})
    # Missing texts have no length; skip them instead of crashing on NaN.
    lengths = df['text'].dropna().str.len().sort_values().to_numpy()
    plt.bar(np.arange(len(lengths)), lengths)
    plt.show()


if __name__ == '__main__':
    split_data('../data/other_data/target_label3.csv')