This tutorial walks through the full process of adding a new column step by step; hopefully it is useful to you.

# -*- coding: utf-8 -*-
import nltk
import csv
from rake_nltk import Rake
import pandas as pd
from imblearn.over_sampling import BorderlineSMOTE
from transformers import TFBertModel, BertTokenizer

# Fix all random seeds for reproducibility
seed_value = 29
import os
os.environ['PYTHONHASHSEED'] = str(seed_value)
import random
random.seed(seed_value)
import numpy as np
np.random.seed(seed_value)
np.set_printoptions(precision=2)

import tensorflow as tf
tf.random.set_seed(seed_value)
import tensorflow_addons as tfa
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
from tensorflow.keras.callbacks import ModelCheckpoint

import re
import sklearn
import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Let TensorFlow allocate GPU memory on demand instead of reserving it all up front
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))
# Uses stop words for English from NLTK, and all punctuation characters by default.
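If the defaults need adjusting, the rake_nltk constructor also accepts an explicit stop word list and punctuation string; the snippet below is only a sketch of that option (the variable r_custom is not used anywhere in this script):

# Optional: a Rake instance with an explicit stop word list and punctuation set
# (illustrative only; the script keeps the defaults via Rake() further down)
import string
r_custom = Rake(stopwords=stopwords.words('english'),
                punctuations=string.punctuation)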
# Remove English stop words from a piece of text
def text_stopwords(text):
    stop_words = set(stopwords.words('english'))
    # Tokenize the text into words
    word_tokens = word_tokenize(text)
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    separator = ' '
    text = separator.join(filtered_sentence)
    return text

#print(text_stopwords(text1))
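A quick check of the helper, assuming the NLTK 'punkt' and 'stopwords' corpora have already been downloaded; the sentence is made up for illustration:

# Illustrative call, not part of the original pipeline
sample = "this is a quick example of the stop word filter"
print(text_stopwords(sample))
# expected output (roughly): quick example stop word filter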
def text_preprocessing(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)                 # drop bracketed fragments
    text = re.sub(r'https?://\S+|www\.\S+', '', text)   # drop URLs
    text = re.sub(r'<.*?>+', '', text)                  # drop HTML tags
    text = re.sub('\n', '', text)                       # drop newlines
    #text = re.sub('|||', '', text)
    text = re.sub(r'\w*\d\w*', '', text)                # drop words that contain digits
    text = text.encode('ascii', 'ignore').decode('ascii')  # strip non-ASCII characters
    if text.startswith("'"):
        text = text[1:-1]
    text = text_stopwords(text)
    return text

# RAKE keyword extractor with the default settings described above
r = Rake()
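To see what one pass of cleaning plus keyword extraction produces, a throwaway string can be run through both steps; the text below is invented for illustration and is not a row from the dataset:

# Illustrative only: a made-up post, not data from mbti_1.csv
demo = "Check out https://example.com <br> I took the test in 2020 and loved the 'results' [link]"
clean = text_preprocessing(demo)
r.extract_keywords_from_text(clean)
print(r.get_ranked_phrases())   # keyword phrases, highest-ranked first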
df = pd.read_csv("mbti_1.csv")
xx = []
# Process every row of the dataset (mbti_1.csv has 8675 rows)
for row in range(len(df)):
    # Read the text column of this row
    content1 = df.loc[row, "content"]
    content1 = text_preprocessing(content1)
    #print(content1)
    # Extraction given the text.
    r.extract_keywords_from_text(content1)
    # Extraction given the list of strings where each string is a sentence.
    #r.extract_keywords_from_sentences(<list of sentences>)
    # To get keyword phrases ranked highest to lowest.
    m = r.get_ranked_phrases()
    # Keep only the 120 top-ranked phrases for this row
    x = m[:120]
    #print(x)
    # Join the phrases with spaces so they stay readable in the new column
    x2 = ' '.join(x)
    #print(x2)
    xx.append(x2)
    #print(xx)
# Add the extracted keyword phrases to the DataFrame as a new "topics" column
df['topics'] = xx
#print('inserted column:\n', df)
df.to_csv("mbti_1z.csv", index=False)

# To get keyword phrases ranked highest to lowest with scores.
#r.get_ranked_phrases_with_scores()
#s_data1 = pd.Series(xx)
#print(s_data1)
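To confirm the new column was actually written out, the exported file can be read back with pandas (same file name as above):

# Read the exported file back and inspect the new column
check = pd.read_csv("mbti_1z.csv")
print(check.columns.tolist())    # should now include 'topics'
print(check['topics'].head())    # first few keyword strings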