python - 无法在类字节对象(bytes-like object)上使用字符串模式
问题描述
import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])
import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# Pattern matching http/https URLs in free text.
# Written as a raw string so the backslash-escaped parentheses reach the regex
# engine unchanged instead of being parsed as (invalid) Python string escapes.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    """Transformer producing one boolean feature per input text.

    The feature is True when at least one sentence of the text starts with a
    token tagged 'VB' or 'VBP', or with the retweet marker 'RT'.
    """

    def starting_verb(self, text):
        """Return True if any sentence in ``text`` starts with a verb or 'RT'.

        Args:
            text: A single document as ``str``.

        Returns:
            bool: True on the first sentence whose leading token is tagged
            'VB'/'VBP' or is the retweet marker; False if none qualifies.
        """
        for sentence in nltk.sent_tokenize(text):
            pos_tags = nltk.pos_tag(tokenize(sentence))
            # A sentence can produce no tokens at all; indexing pos_tags[0]
            # would then raise IndexError, so skip it.
            if not pos_tags:
                continue
            first_word, first_tag = pos_tags[0]
            # tokenize() lowercases every token, so a literal comparison with
            # 'RT' could never succeed; compare case-insensitively instead.
            if first_tag in ('VB', 'VBP') or first_word.lower() == 'rt':
                return True
        return False

    def fit(self, X, y=None):
        """No-op fit; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Apply ``starting_verb`` to every element of ``X``.

        Args:
            X: Iterable of text documents.

        Returns:
            pandas.DataFrame: A single column holding the per-document result
            of ``starting_verb``.
        """
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)
def load_data(filepath='corporate_messaging.csv'):
    """Load the messaging CSV and return message texts and category labels.

    Only rows whose ``category:confidence`` equals 1 and whose ``category``
    is not ``'Exclude'`` are kept.

    Args:
        filepath: Path of the CSV file to read. Defaults to
            ``'corporate_messaging.csv'``, the location that was previously
            hard-coded, so existing calls keep working.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the array of the ``text`` column and
        ``y`` is the array of the ``category`` column for the kept rows.
    """
    df = pd.read_csv(filepath, encoding='latin-1')
    keep = (df['category:confidence'] == 1) & (df['category'] != 'Exclude')
    df = df[keep]
    X = df['text'].values
    y = df['category'].values
    return X, y
def tokenize(text):
    """Split ``text`` into normalized tokens.

    Every URL matched by ``url_regex`` is replaced with the literal
    ``"urlplaceholder"`` before tokenizing. Each token is then lemmatized,
    lowercased and stripped of surrounding whitespace.

    Args:
        text: The document to tokenize.

    Returns:
        list: The cleaned tokens, in their original order.
    """
    for found_url in re.findall(url_regex, text):
        text = text.replace(found_url, "urlplaceholder")

    raw_tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(token).lower().strip()
        for token in raw_tokens
    ]
def model_pipeline():
    """Build the (unfitted) classification pipeline.

    The feature stage is a union of two branches: a bag-of-words branch
    (``CountVectorizer`` using ``tokenize``, followed by ``TfidfTransformer``)
    and the ``StartingVerbExtractor`` flag. The combined features feed a
    ``RandomForestClassifier``.

    Returns:
        Pipeline: The assembled estimator.
    """
    text_branch = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
    ])
    feature_union = FeatureUnion([
        ('text_pipeline', text_branch),
        ('starting_verb', StartingVerbExtractor()),
    ])
    return Pipeline([
        ('features', feature_union),
        ('clf', RandomForestClassifier()),
    ])
def display_results(y_test, y_pred):
    """Print the label set, confusion matrix and accuracy of predictions.

    Args:
        y_test: True labels (array-like).
        y_pred: Predicted labels (array-like), same length as ``y_test``.

    Returns:
        None. The results are written to standard output.
    """
    # Coerce to arrays so the element-wise comparison below also works when
    # plain lists are passed in.
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)
    # Use every label seen in either array: taking them from y_pred alone
    # drops classes the model never predicted from the matrix.
    labels = np.union1d(y_test, y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()
    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)
def main():
    """Run the full workflow: load, split, train, predict and report."""
    texts, categories = load_data()
    X_train, X_test, y_train, y_test = train_test_split(texts, categories)

    classifier = model_pipeline()
    classifier.fit(X_train, y_train)

    predictions = classifier.predict(X_test)
    display_results(y_test, predictions)


# Executed on import as well as when run as a script.
main()
整个代码运行时没有错误。然而,为了理解自定义转换器(Custom Transformer),我把 class StartingVerbExtractor() 中的 def starting_verb() 单独拆出来,以便更好地理解代码在执行过程中发生了什么。但是,当把数据 X_train 传入时,sentence_list = nltk.sent_tokenize(text) 抛出了 TypeError: cannot use a string pattern on a bytes-like object。
补充信息:在 def main() 函数中,如果不使用 Pipeline,可以看到 X_train 是直接传入转换器的,例如:
start_verb = StartingVerbExtractor()
X_train_verb = start_verb.fit_transform(X_train)
X_train 的样本:
array(['Barclays CEO stresses the importance of regulatory and cultural reform in financial services at Brussels conference ht.tp://t.c.o/Ge9L.p7hpyG','Barclays announces result of Rights Issue ht.tp://t.c,o/LbIq.qh3wwG']
我的问题:为什么代码作为一个整体可以正常运行,但单独把 X_train 传给分词器 sentence_list = nltk.sent_tokenize(text) 时,却返回 TypeError: cannot use a string pattern on a bytes-like object?
解决方案
推荐阅读
- sql - 将存储过程的多个结果集插入临时表
- amazon-web-services - 如何在 RDS MariaDb 中备份数据库
- python - 对具有多个结果的列进行分组和连接
- javascript - 非零正浮点数的正则表达式
- sql-server - 递归查询以使用初始查询中返回的日期作为后续查询的限制
- shopify - 如何让产品页面上的所有应用程序在主页上的特色产品部分工作?
- flutter - 如何在 Flutter 中正确地全局存储来自 POST 请求的数据?
- sql - 在 postgresql 中展平子数组
- java - java.lang.NullPointerException,当使用 Selenium 和 cucumber 进行自动化测试时
- php - 我的 Symfony 应用程序在生产中崩溃,因为它找不到 WebProfilerBundle,我怎样才能让它工作?