Preprocessing the R8 and R52 datasets into {'token', 'label'} format
#!/usr/bin/env python
# coding: utf-8

# Supported datasets: TREC, R8, R52, WebKB

import re
import json
import random

import tqdm

# Standard English stopword list plus a few escaped punctuation tokens
# that clean_str() produces (e.g. '\.', '\?').
english_stopwords = [
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
    "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
    'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll",
    'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
    'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
    'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against',
    'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
    'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
    'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
    'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than',
    'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
    'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn',
    "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn',
    "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't",
    'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn',
    "wouldn't", '\\.', '\\?', ',', '\\!', "'s", '']


def clean_stopwords(sample):
    """
    :param sample: List[Str], lower case
    :return: List[Str] with stopword tokens removed
    """
    return [token for token in sample if token not in english_stopwords]


def clean_str(string):
    """
    Original source: https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    :param string: Str
    :return: Str, cleaned and lower-cased
    """
    string = string.strip().strip('"')
    string = re.sub(r"[^A-Za-z(),!?\.\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"\.", " \. ", string)
    string = re.sub(r"\"", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def preprocess_line(sample):
    """
    :param sample: Str, a raw document; it is tokenized and filtered against the stopword list
    :return: token_list -> List[Str]
    """
    sample = clean_str(sample.lstrip().rstrip())
    token_list = clean_stopwords(sample.split(' '))
    return token_list


def preprocess_raw_file(file_path):
    """
    :param file_path: Str, path of the raw file (one "label<TAB>text" sample per line)
    :return: (raw_data, corpus_data)
             raw_data    -> List[Dict{'token': Str, 'label': List[Str]}]
             corpus_data -> List[Str], one JSON object per sample
    """
    corpus_data = list()
    raw_data = list()
    print('Loading and preprocessing raw data in {}'.format(file_path))
    with open(file_path, 'r') as f:
        for line in tqdm.tqdm(f):
            line = line.strip()
            label = line[:line.find('\t')]
            content = line[line.find('\t') + 1:]
            sample_tokens = preprocess_line(content)
            raw_data.append({'token': content.rstrip(), 'label': [label]})
            # Simple variant: corpus_data.append(json.dumps({'token': sample_tokens, 'label': [label]}))
            corpus_data.append(json.dumps({'doc_label': [label],
                                           'doc_token': sample_tokens,
                                           'doc_keyword': [],
                                           'doc_topic': []}))
    print('The number of samples: {}'.format(len(corpus_data)))
    return raw_data, corpus_data


def train_val_file(corpus_data, train_path, val_path):
    """Shuffle the corpus and split it 9:1 into train and validation files."""
    len_corpus = len(corpus_data)
    len_0_9 = int(len_corpus * 0.9)
    random.shuffle(corpus_data)
    train = '\n'.join(corpus_data[:len_0_9])
    val = '\n'.join(corpus_data[len_0_9:])
    with open(train_path, 'w') as f:
        f.write(train + '\n')
    with open(val_path, 'w') as f:
        f.write(val + '\n')
    print('process train val file')


def test_file(corpus_data, test_path):
    """Write the whole corpus to a single file without splitting."""
    write_corpus_data = '\n'.join(corpus_data)
    with open(test_path, 'w') as f:
        f.write(write_corpus_data)
    print('process test file')


def load_processed_file(file_path):  # currently unused
    """
    :param file_path: Str, path of an already processed (JSON-per-line) file
    :return: (raw_data, corpus_data)
    """
    corpus_data = list()
    raw_data = list()
    print('Loading raw data in {}'.format(file_path))
    with open(file_path, 'r') as f:
        for line in tqdm.tqdm(f):
            raw_data.append(json.loads(line.rstrip()))
            corpus_data.append(line.rstrip())
    print('The number of samples: {}'.format(len(corpus_data)))
    return raw_data, corpus_data


if __name__ == '__main__':
    # TREC, R8, R52, WebKB
    dataset = 'R52'
    train_path = '../data/' + dataset + '/train.txt'
    test_path = '../data/' + dataset + '/test.txt'
    write_train_path = '../data/' + dataset + '_train.json'
    write_val_path = '../data/' + dataset + '_val.json'
    write_test_path = '../data/' + dataset + '_test.json'

    raw_data, corpus_data = preprocess_raw_file(train_path)
    train_val_file(corpus_data, write_train_path, write_val_path)  # split out a validation set
    # test_file(corpus_data, write_train_path)  # alternative: no validation split

    raw_data, corpus_data = preprocess_raw_file(test_path)
    test_file(corpus_data, write_test_path)
To switch datasets, change the dataset variable in the __main__ block.
The datasets come from TextGCN; the converted output is intended as input for the NeuralNLP-NeuralClassifier project.
Original format:
One sample per line: the label, a tab separator, then the raw text.
Converted format (one JSON object per line). The commented-out {'token', 'label'} variant in the script produces samples like:
{"token": ["caesars", "world", "says", "considers", "restructuring", "sale", "company"], "label": ["acq"]}
The active code writes the NeuralNLP-NeuralClassifier schema instead:
{
"doc_label": ["Computer--MachineLearning--DeepLearning", "Neuro--ComputationalNeuro"],
"doc_token": ["I", "love", "deep", "learning"],
"doc_keyword": ["deep learning"],
"doc_topic": ["AI", "Machine learning"]
}
"doc_keyword" and "doc_topic" are optional.