# liukangjia / text_to_vector
# coding:utf-8
import pandas as pd
import re


# Punctuation and symbol characters (ASCII and full-width CJK) that are
# stripped from the text.  Note that ",-." inside the class is a range that
# covers ",", "-" and ".".
_PUNCTUATION_RE = re.compile(
    '[!"#%&\'()*+,-./:：;；|=?@，\t—。?★、…【】《》？“”‘’！[\\]^_`{|}~]+'
)
# Any run of whitespace: spaces, tabs and line breaks.
_WHITESPACE_RE = re.compile('\\s+')


def text_parse(text):
    """Strip punctuation, symbols and all whitespace from ``text``.

    Letters, digits and CJK characters are kept unchanged; only the
    characters listed in ``_PUNCTUATION_RE`` and whitespace are removed.

    Args:
        text: The raw string to clean.  Non-string values (for example
            ``None`` or the ``NaN`` pandas yields for an empty CSV cell)
            are treated as empty text.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''
    # The earlier pattern ended with two literal spaces after the character
    # class, so punctuation was only removed when two spaces followed it.
    text = _PUNCTUATION_RE.sub('', text)
    # ``\s`` already matches line breaks, so a single pass removes every
    # kind of whitespace; no separate newline handling is required.
    return _WHITESPACE_RE.sub('', text)


def split_data(file_path, output_dir='./data', random_state=None):
    """Clean a labelled CSV and split it into test, dev and train files.

    The CSV is read with pandas and must provide ``text`` and ``label``
    columns.  Every text is cleaned with ``text_parse``; rows whose cleaned
    text is shorter than 2 or longer than 250 characters are dropped, as are
    exact duplicate rows.  The remaining rows are shuffled and divided
    15% / 15% / 70% into test, dev and train sets, each written as a
    tab-separated file without header or index.

    Args:
        file_path: Path of the source CSV file (UTF-8 encoded).
        output_dir: Directory that receives ``test.txt``, ``train.txt`` and
            ``dev.txt``.  It is created when it does not exist yet.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced; ``None`` keeps it random.

    Returns:
        A ``(df_test, df_dev, df_train)`` tuple of DataFrames with the
        columns ``text`` and ``label_id``.
    """
    import os

    raw = pd.read_csv(file_path, encoding='utf-8')
    df = pd.DataFrame({
        'text': raw['text'].map(text_parse),
        'label_id': raw['label'],
    })

    # Keep only texts of 2..250 characters.  A boolean mask is used instead
    # of dropping by index label so the filter does not depend on the index.
    text_length = df['text'].map(len)
    df = df[(text_length >= 2) & (text_length <= 250)]
    df = df.drop_duplicates()

    # Shuffle, then split 0.15 / 0.15 / 0.7 into test / dev / train.
    df = df.sample(frac=1.0, random_state=random_state)
    rows = df.shape[0]
    split_index_1 = int(rows * 0.15)
    split_index_2 = int(rows * 0.3)

    df_test = df.iloc[0:split_index_1, :]
    df_dev = df.iloc[split_index_1:split_index_2, :]
    df_train = df.iloc[split_index_2:rows, :]

    # Writing used to fail when the target directory was missing.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    for frame, file_name in (
        (df_test, 'test.txt'),
        (df_train, 'train.txt'),
        (df_dev, 'dev.txt'),
    ):
        frame.to_csv(
            os.path.join(output_dir, file_name),
            sep="\t",
            index=False,
            header=False,  # documented way to suppress the header row
            encoding='utf-8',
        )
    return df_test, df_dev, df_train


def data_show(file):
    """Plot the sorted text lengths of a tab-separated data file.

    The file is read without a header row into the columns ``text`` and
    ``label``; the character length of each text is collected, sorted in
    ascending order and shown as a bar chart.

    Args:
        file: Path of the tab-separated file to inspect.
    """
    frame = pd.read_csv(file, names=['text', 'label'], sep='\t', encoding='utf-8')
    # One bar per row, ordered from the shortest text to the longest.
    sorted_lengths = sorted(len(entry) for entry in frame.text)

    import matplotlib.pyplot as plt
    import numpy as np

    positions = np.arange(len(sorted_lengths))
    plt.bar(positions, sorted_lengths)
    plt.show()


if __name__ == '__main__':
    # Script entry point: build the split files from the labelled CSV.
    source_csv = '../data/other_data/target_label3.csv'
    split_data(source_csv)