python - 在 Python 中预处理文本时出现 numpy.str_ 错误
问题描述
我正在尝试使用以下代码对 pandas 数据框中某一列的文本进行预处理。预处理包括转换为小写、分词和删除停用词。但是,我收到了以下错误:'numpy.str_' object is not callable。
知道我该如何解决这个问题吗?谢谢。
import nltk
import numpy as np
import re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import string
# Filter vocabulary: NLTK's English stop words plus every ASCII punctuation mark.
# NOTE: rebinding `stopwords` below hides the `nltk.corpus.stopwords` module
# imported above; the fully-qualified access on the next line avoids relying on it.
i = nltk.corpus.stopwords.words('english')
j = list(string.punctuation)
stopwords = {*i, *j}
def preprocess(x, stop_words=None):
    """Lowercase *x*, strip noise characters and drop stop/short words.

    Args:
        x: Text to clean. Any ``str`` works, including ``numpy.str_``.
        stop_words: Optional collection of words to drop. When omitted,
            the module-level ``stopwords`` collection is used.

    Returns:
        The surviving words joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopwords
    # Build the lookup set once instead of once per word.
    blocked = set(stop_words)
    # `x` is a string, not a callable: call .lower() on it directly.
    # Lowercasing happens first, so the pattern only needs to keep a-z and whitespace.
    text = re.sub(r'[^a-z\s]', '', x.lower())  # get rid of noise
    # Keep words longer than three characters that are not stop words.
    kept = [w for w in text.split() if len(w) > 3 and w not in blocked]
    return ' '.join(kept)
# Kept so that any other code still referring to the vectorized wrapper keeps working.
preprocess2 = np.vectorize(preprocess)
# Series.apply already invokes the function once per element, so pass the plain
# function. Going through np.vectorize hands each value over as numpy.str_ and
# adds a probing call per element to infer the output type.
merged_sort['processed_text'] = merged_sort['translated_text'].apply(preprocess)
错误
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-22-e3eee761f15b> in <module>
----> 1 merged_sort['processed_text'] = merged_sort['translated_text'].apply(preprocess2)
~\Anaconda3\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
3846 else:
3847 values = self.astype(object).values
-> 3848 mapped = lib.map_infer(values, f, convert=convert_dtype)
3849
3850 if len(mapped) and isinstance(mapped[0], Series):
pandas\_libs\lib.pyx in pandas._libs.lib.map_infer()
~\Anaconda3\lib\site-packages\numpy\lib\function_base.py in __call__(self, *args, **kwargs)
2089 vargs.extend([kwargs[_n] for _n in names])
2090
-> 2091 return self._vectorize_call(func=func, args=vargs)
2092
2093 def _get_ufunc_and_otypes(self, func, args):
~\Anaconda3\lib\site-packages\numpy\lib\function_base.py in _vectorize_call(self, func, args)
2159 res = func()
2160 else:
-> 2161 ufunc, otypes = self._get_ufunc_and_otypes(func=func, args=args)
2162
2163 # Convert args to object arrays first
~\Anaconda3\lib\site-packages\numpy\lib\function_base.py in _get_ufunc_and_otypes(self, func, args)
2119
2120 inputs = [arg.flat[0] for arg in args]
-> 2121 outputs = func(*inputs)
2122
2123 # Performance note: profiling indicates that -- for simple
<ipython-input-21-1631cf3dded9> in preprocess(x)
12
13 def preprocess(x):
---> 14 x = re.sub('[^a-z\s]', '', x().lower()) # get rid of noise
15 x = [w for w in x.split() if w not in set(stopwords) if len(w) > 3] # remove stopwords
16 return ' '.join(x)
TypeError: 'numpy.str_' object is not callable
解决方案
传入 preprocess 的 x 本身就是字符串(numpy.str_),不是函数,因此不能写成 x()。将 x().lower() 改为 x.lower() 即可消除该错误。
推荐阅读
- c++ - 当我输入较大的数字时,为什么我的程序会到达文件末尾?
- loadrunner - 在 Loadrunner 脚本中附加文件时,不会在脚本中捕获文件名或位置路径
- python - pd.Series 的排列导致就地更改
- javascript - 将 react-color 组件插入到 react+d3 教程中
- r - 加速矩阵运算
- android - Android Emulator 无法在 react-native run-android 中运行我的应用
- apache-spark - 关闭分区发现 Spark
- javascript - 类对象方法声明React之间的区别?
- laravel - Laravel 关闭请求
- c# - 是什么导致了这个 System.InvalidCastException?