gensim TaggedDocument list raises TypeError: 'NoneType' object is not iterable

Problem description

OK, so I want to wrap the Doc2Vec class, but for some reason that I don't fully understand, it does not take my list of tagged documents.

This is what I am trying to do:

class doc2vec_model(Doc2Vec):
    
    def train(self,path):
        self._path = path
        self._tagged_documents()

        super(Doc2Vec, self).__init__(self._docs, min_count = 100, 
                                     vector_size=300, 
                                     epochs = 20, 
                                     negative = 5, 
                                     workers=20, 
                                     sample = 1e-5,
                                     alpha=0.01,
                                     min_alpha=0.0001)

    def _tagged_documents(self,):
        self.file_l = [name for name in glob.iglob(self._path, recursive=True)]
        self._docs = [] 
        for f_id, path in enumerate(self.file_l):
            with open(path,'r') as f:
                docu = f.read()
                docu = norm_string(docu)
                docu = docu.split(' ')
                chunk_size = 200 
                chunk_l = [docu[i:i+chunk_size] for i in range(0,len(docu),chunk_size)]
                for c_id, docu_chunk in enumerate(chunk_l):
                    self._docs.append(TaggedDocument(words=docu_chunk, tags=(f'DOC_{f_id}_{c_id}',)))

And then I call it like this:

model = doc2vec_model()
model.train('/path/to/my/docs/*')

The error I get is the following:

~/.local/lib/python3.6/site-packages/gensim/models/base_any2vec.py in __init__(self, sentences, corpus_file, workers, vector_size, epochs, callbacks, batch_words, trim_rule, sg, alpha, window, seed, hs, negative, ns_exponent, cbow_mean, min_alpha, compute_loss, **kwargs)
    743                 raise TypeError("You can't pass a generator as the sentences argument. Try a sequence.")
    744 
--> 745             self.build_vocab(sentences=sentences, corpus_file=corpus_file, trim_rule=trim_rule)
    746             self.train(
    747                 sentences=sentences, corpus_file=corpus_file, total_examples=self.corpus_count,

~/.local/lib/python3.6/site-packages/gensim/models/doc2vec.py in build_vocab(self, documents, corpus_file, update, progress_per, keep_raw_vocab, trim_rule, **kwargs)
    926         total_words, corpus_count = self.vocabulary.scan_vocab(
    927             documents=documents, corpus_file=corpus_file, docvecs=self.docvecs,
--> 928             progress_per=progress_per, trim_rule=trim_rule
    929         )
    930         self.corpus_count = corpus_count

~/.local/lib/python3.6/site-packages/gensim/models/doc2vec.py in scan_vocab(self, documents, corpus_file, docvecs, progress_per, trim_rule)
   1123             documents = TaggedLineDocument(corpus_file)
   1124 
-> 1125         total_words, corpus_count = self._scan_vocab(documents, docvecs, progress_per, trim_rule)
   1126 
   1127         logger.info(

~/.local/lib/python3.6/site-packages/gensim/models/doc2vec.py in _scan_vocab(self, documents, docvecs, progress_per, trim_rule)
   1050         checked_string_types = 0
   1051         vocab = defaultdict(int)
-> 1052         for document_no, document in enumerate(documents):
   1053             if not checked_string_types:
   1054                 if isinstance(document.words, string_types):

TypeError: 'NoneType' object is not iterable

And I don't understand why: by the time I pass it to the __init__ of Doc2Vec, the self._docs list is full of TaggedDocument objects, so it should be iterable.

Tags: python, python-3.x, gensim

Solution
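
The exception comes from the way the parent constructor is invoked, not from the corpus itself. super(Doc2Vec, self) starts the method lookup *after* Doc2Vec in the MRO, so Doc2Vec.__init__ never runs; the call lands directly in the base class in base_any2vec.py, whose first positional parameter is sentences (the top frame of the traceback). When that base class then calls self.build_vocab(sentences=...), it reaches Doc2Vec.build_vocab, which reads a parameter named documents; nothing is bound to documents, so it stays None, and _scan_vocab ends up iterating over None. The list of TaggedDocument objects is perfectly iterable; it simply never reaches the parameter Doc2Vec actually looks at.

A second problem is waiting behind the first one: the subclass defines a method called train(self, path), which shadows Doc2Vec.train. As the traceback shows (the self.train(...) call right after build_vocab), the constructor trains the model itself once it receives documents, so the overridden method would then be called with keyword arguments it does not accept.

Below is a minimal sketch of one way to restructure the wrapper, assuming gensim 3.x as in the traceback (the documents keyword also exists in 4.x): the corpus is built before the parent constructor runs, plain super() is used so Doc2Vec.__init__ actually executes, and the custom method is no longer named train. norm_string is the asker's own normalisation helper, so a trivial stand-in is defined here only to make the sketch self-contained.

import glob
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def norm_string(text):
    # Stand-in for the original normalisation helper.
    return text.lower()

class doc2vec_model(Doc2Vec):

    def __init__(self, path, **doc2vec_kwargs):
        self._path = path
        docs = self._tagged_documents()
        # Plain super() resolves to Doc2Vec.__init__, which accepts the
        # corpus through its `documents` parameter and trains the model.
        super().__init__(documents=docs, **doc2vec_kwargs)

    def _tagged_documents(self):
        docs = []
        for f_id, file_path in enumerate(glob.iglob(self._path, recursive=True)):
            with open(file_path, 'r') as f:
                words = norm_string(f.read()).split(' ')
            chunk_size = 200
            chunks = [words[i:i + chunk_size] for i in range(0, len(words), chunk_size)]
            for c_id, chunk in enumerate(chunks):
                docs.append(TaggedDocument(words=chunk, tags=[f'DOC_{f_id}_{c_id}']))
        return docs

model = doc2vec_model('/path/to/my/docs/*',
                      min_count=100, vector_size=300, epochs=20,
                      negative=5, workers=20, sample=1e-5,
                      alpha=0.01, min_alpha=0.0001)

If the two-step pattern (construct first, feed documents later) matters, the same effect can be obtained by creating a bare Doc2Vec(...) with only the hyperparameters and then calling model.build_vocab(docs) followed by model.train(docs, total_examples=model.corpus_count, epochs=model.epochs); that keeps the name train free for gensim's own method.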

