将MR数据集处理为 {'token', 'label'} 格式

从text_gcn中拿数据集
标签单独存放在一个文件中，每行一个数字标签（0 或 1）
原文说因为文本实在是太短了，所以没有去停用词
注意：MR使用Latin1编码！！！
查看文件编码格式的方法：
先用 vim 打开文件：vim text_train.txt
然后在 vim 命令行中输入 :set fileencoding
#!/usr/bin/env python
# coding: utf-8
# TREC, R8, R52, WebKB
import re
import tqdm
import json
import random

# english_stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll",
#                      "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's",
#                      'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',
#                      'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am',
#                      'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
#                      'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
#                      'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during',
#                      'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off',
#                      'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
#                      'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor',
#                      'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don',
#                      "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren',
#                      "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
#                      "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't",
#                      'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
#                      'won', "won't", 'wouldn', "wouldn't", '\\.', '\\?', ',', '\\!', "'s", '']
# english_stopwords = ['\\.', '\\?', ',', '\\!', "'s", '']


def clean_stopwords(sample, stopwords=None):
    """
    Remove stop-word tokens from an already tokenized, lower-cased sample.

    :param sample: List[Str], lower case
    :param stopwords: optional iterable of tokens to drop. When omitted, a
        module-level ``english_stopwords`` is used if one is defined;
        otherwise a minimal punctuation-only list is used (it mirrors the
        short list that is left commented out in this file).
    :return:  List[Str], the tokens of ``sample`` that are not stop words,
        in their original order
    """
    if stopwords is None:
        # The module-level list is commented out in this file, so looking it
        # up directly would raise NameError; fall back to a safe default.
        stopwords = globals().get(
            'english_stopwords',
            ('\\.', '\\?', ',', '\\!', "'s", ''),
        )
    # Build the lookup set once so each membership test is O(1).
    blocked = set(stopwords)
    return [token for token in sample if token not in blocked]


def clean_str(string):
    """
    Normalize a raw sentence into a lower-cased, space-separated string.

    Original Source:  https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order: strip surrounding whitespace and double quotes, replace
    every character outside ``A-Za-z(),!?.'`` and the backtick with a space,
    split off English clitics ('s, 've, n't, 're, 'd, 'll), pad punctuation
    with spaces, collapse whitespace runs, then strip and lower-case.

    Note that ``.``, ``(``, ``)`` and ``?`` are emitted with a leading
    backslash (e.g. ``\\.``), because ``re.sub`` leaves such non-letter
    escapes in the replacement untouched. This output format is kept as is.

    :param string: Str
    :return -> Str
    """
    text = string.strip().strip('"')
    text = re.sub(r"[^A-Za-z(),!?\.\'\`]", " ", text)

    # (pattern, replacement) pairs, applied in this exact order.
    # Replacements containing a backslash are raw strings so they are no
    # longer invalid escape sequences in a normal string literal; their
    # runtime value is unchanged.
    substitutions = (
        (r"'s", " 's"),
        (r"'ve", " 've"),
        (r"n't", " n't"),
        (r"'re", " 're"),
        (r"'d", " 'd"),
        (r"'ll", " 'll"),
        (r",", " , "),
        (r"\.", r" \. "),
        # Double quotes were already replaced by the character filter above;
        # this rule is retained from the original for fidelity.
        (r'"', " , "),
        (r"!", " ! "),
        (r"\(", r" \( "),
        (r"\)", r" \) "),
        (r"\?", r" \? "),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip().lower()


def preprocess_line(sample):
    """
    Clean one raw text line and split it into tokens.

    :param sample: Str, the raw line; it is stripped, normalized with
        ``clean_str`` and then split on single spaces
    :return: token_list -> List[Str]
    """
    normalized = clean_str(sample.strip())
    # Stop-word filtering (clean_stopwords) is intentionally not applied here.
    return normalized.split(' ')

def readData(data_path):
    """
    Read a text file into a list of whitespace-stripped lines.

    The file is decoded as Latin-1 (the encoding used by the MR dataset).

    :param data_path: Str, path of the file to read
    :return: (doc_content_list, len_doc_content_list) -> (List[Str], int),
        the stripped lines and how many there are
    """
    # The context manager closes the file even if reading raises.
    with open(data_path, 'r', encoding="latin1") as f:
        doc_content_list = [line.strip() for line in f]
    len_doc_content_list = len(doc_content_list)
    print("read %d lines from %s" % (len_doc_content_list, data_path))
    return doc_content_list, len_doc_content_list


def preprocess_raw_file_1(token_path, label_path):
    """
    Build the JSON corpus directly from a text file and a label file.

    Both files are decoded as Latin-1 (the encoding used by the MR dataset).
    Line ``i`` of the text file is paired with line ``i`` of the label file.

    :param token_path: Str, path of the raw text file, one sample per line
    :param label_path: Str, path of the label file, one label per line
    :return: List[Str], one JSON string per label line, each encoding
        {'doc_label': [label], 'doc_token': List[Str],
         'doc_keyword': [], 'doc_topic': []}
    :raises IndexError: if the text file has fewer lines than the label file
    """
    # Context managers guarantee the files are closed even on error.
    with open(token_path, 'r', encoding="latin1") as f:
        token_list = [preprocess_line(line.strip()) for line in f]

    with open(label_path, 'r', encoding="latin1") as f:
        label_list = [line.strip() for line in f]

    corpus_data = []
    for i, label in enumerate(label_list):
        sample_tokens = token_list[i]
        corpus_data.append(json.dumps({'doc_label': [label], 'doc_token': sample_tokens, 'doc_keyword':[], 'doc_topic':[]}))
    print('The number of samples: {}'.format(len(corpus_data)))
    return corpus_data


def preprocess_raw_file(token_list, label_list):
    """
    Turn parallel lists of raw texts and labels into JSON-encoded samples.

    :param token_list: List[Str], raw text of each sample
    :param label_list: List[Str], label of each sample, aligned by index
    :return: List[Str], one JSON string per label, each encoding
        {'doc_label': [label], 'doc_token': List[Str],
         'doc_keyword': [], 'doc_topic': []}
    """
    records = []
    for idx, label in enumerate(label_list):
        tokens = preprocess_line(token_list[idx])
        record = {'doc_label': [label], 'doc_token': tokens, 'doc_keyword': [], 'doc_topic': []}
        records.append(json.dumps(record))
    print('The number of samples: {}'.format(len(records)))
    return records

def train_val_file(corpus_data, train_path, val_path):
    """
    Shuffle the corpus and write a 90% / 10% train / validation split.

    ``corpus_data`` is shuffled in place with ``random.shuffle``. Each output
    file holds one sample per line and ends with a newline.

    :param corpus_data: List[Str], serialized samples (shuffled in place)
    :param train_path: Str, output path for the first 90% of the samples
    :param val_path: Str, output path for the remaining samples
    """
    split_at = int(len(corpus_data) * 0.9)
    random.shuffle(corpus_data)
    train_text = '\n'.join(corpus_data[:split_at])
    val_text = '\n'.join(corpus_data[split_at:])
    for out_path, text in ((train_path, train_text), (val_path, val_text)):
        with open(out_path, 'w') as handle:
            handle.write(text + '\n')
    print('process train val file')

def test_file(corpus_data, test_path):
    """
    Write all samples to a single file, one per line.

    No trailing newline is written after the last sample.

    :param corpus_data: List[Str], serialized samples
    :param test_path: Str, output path
    """
    with open(test_path, 'w') as handle:
        handle.write('\n'.join(corpus_data))
    print('process test file')


def load_processed_file(file_path):  # currently unused
    """
    Load a processed file that holds one JSON document per line.

    :param file_path: Str, file path of the processed file
    :return: (raw_data, corpus_data), where raw_data is the list of objects
        decoded from each line and corpus_data is the list of the same lines
        as right-stripped strings
    """
    decoded_samples = []
    raw_lines = []
    print('Loading raw data in {}'.format(file_path))
    with open(file_path, 'r') as handle:
        for raw_line in tqdm.tqdm(handle):
            stripped = raw_line.rstrip()
            decoded_samples.append(json.loads(stripped))
            raw_lines.append(stripped)
    print('The number of samples: {}'.format(len(raw_lines)))
    return decoded_samples, raw_lines


if __name__ == '__main__':
    # Fixed seed so any shuffling (e.g. in train_val_file) is reproducible.
    random.seed(24)

    # Labels live in a separate one-column file; texts are raw,
    # space-separated sentences.
    dataset = 'mr'
    data_root = '../data/' + dataset
    train_path = data_root + '/text_train.txt'
    train_label_path = data_root + '/label_train.txt'
    test_path = data_root + '/text_test.txt'
    test_label_path = data_root + '/label_test.txt'
    write_train_path = data_root + '_train.json'
    write_val_path = data_root + '_val.json'
    write_test_path = data_root + '_test.json'

    # Alternative: build the corpus straight from the two files.
    # corpus_data = preprocess_raw_file_1(train_path, train_label_path)
    # train_val_file(corpus_data, write_train_path, write_val_path)  # with a validation split

    train_list, len_train = readData(train_path)
    # train_label_list, _ = readData(train_label_path)
    # corpus_data = preprocess_raw_file(train_list, train_label_list)
    # train_val_file(corpus_data, write_train_path, write_val_path)  # with a validation split
    # test_file(corpus_data, write_train_path)  # without a validation split

    test_list, len_test = readData(test_path)
    # test_label_list, _ = readData(test_label_path)
    # corpus_data = preprocess_raw_file(test_list, test_label_list)
    # test_file(corpus_data, write_test_path)

    print(len_train)  # 7108
    print(len_test)  # 3554
Python 数据集
将MR数据集处理为 {'token', 'label'} 格式

相关

学习《Python编程从入门到实践》PDF+代码训练

python-----面向对象简单理解

python多线程控制

Sublime 的安装、汉化、配置、Python环境和插件

python——time strftime() 函数表示当地时间

python 初识函数

python 函数对象嵌套闭包

Python栈溢出——设置python栈大小

python-面向对象-01课堂笔记

python爬虫

Python 之父的解析器系列之五：左递归 PEG 语法

Python 为了提升性能，竟运用了共享经济

标签