Question
I have a Wikipedia dump as an xml.bz2 file and want to convert it to txt for later processing with BERT. The goal is to have each sentence on its own line and an empty line between articles (a requirement of BERT pre-training).
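To make the target format concrete, here is a made-up example of what the output file should look like (one sentence per line, blank line between articles):

This is the first sentence of article one.
This is the second sentence of article one.

This is the first sentence of article two.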
I tried to follow this post (How to get the wikipedia corpus text with punctuation by using gensim wikicorpus?) and did a lot of research of my own. This is what I have so far:
from __future__ import print_function
import sys
from gensim.corpora import WikiCorpus
from wikicorpus import *
import six

def tokenize(content):
    # override original method in wikicorpus.py
    return [token.encode('utf8') for token in content.split()
            if len(token) <= 15 and not token.startswith('_')]

def process_article(args):
    # override original method in wikicorpus.py
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenize(text)
    return result, title, pageid

class MyWikiCorpus(WikiCorpus):
    def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)):
        WikiCorpus.__init__(self, fname, processes, lemmatize, dictionary, filter_namespaces)

    def get_texts(self):
        articles, articles_all = 0, 0
        positions, positions_all = 0, 0
        texts = ((text, self.lemmatize, title, pageid)
                 for title, text, pageid
                 in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces))
        pool = multiprocessing.Pool(self.processes)
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for tokens, title, pageid in pool.imap(process_article, group):  # chunksize=10):
                articles_all += 1
                positions_all += len(tokens)
                if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                    continue
                articles += 1
                positions += len(tokens)
                if self.metadata:
                    yield (tokens, (pageid, title))
                else:
                    yield tokens
        pool.terminate()

        print(
            "finished iterating over Wikipedia corpus of %i documents with %i positions"
            " (total %i articles, %i positions before pruning articles shorter than %i words)",
            articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS)
        self.length = articles  # cache corpus length
Following the post above, I overrode the functions and then called the class like so:
def make_corpus2(inp, outp):
    space = " "
    i = 0
    output = open(outp, 'w')
    wiki = MyWikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        if six.PY3:
            output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
        else:
            output.write(space.join(text) + "\n")
        i = i + 1
        if (i % 10000 == 0):
            print("Saved " + str(i) + " articles")
    output.close()
    print("Finished Saved " + str(i) + " articles")
I then called it with make_corpus2("./Wiki_dump_gross.xml.bz2", "./pretrain/wiki_dump_sentences.txt").
There is no error and the output file gets filled, but the punctuation is still missing. I feel like I incorporated the solution from the linked post, so I am wondering where my mistake could be. To clarify: I ran this in a Jupyter Notebook.
Example of the output I get:
der begriff heilkunde bezeichnet die gesamtheit der menschlichen kenntnisse und fähigkeiten über die
entstehung heilung und verhinderung prävention von krankheiten er wird als synonym für medizin im
allgemeinen aber auch innerhalb der der volksheilkunde und jeder form der psychotherapie verwendet
ausübung einer heilkunde die ausübung einer heilkunde genannt auch heilkunst ist in deutschland
österreich und der schweiz rechtlich unterschiedlich geregelt
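For comparison, the first sentence should ideally come out roughly like this (casing and punctuation kept, one sentence per line):

Der Begriff Heilkunde bezeichnet die Gesamtheit der menschlichen Kenntnisse und Fähigkeiten über die Entstehung, Heilung und Verhinderung (Prävention) von Krankheiten.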
Also, I was wondering whether it is possible to keep the casing of the text, since capitalization is a meaningful part of German.
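If it helps to make that part concrete, this is the kind of call I have in mind. It is only a sketch and assumes a gensim version whose WikiCorpus accepts a tokenizer_func and a lower flag; I have not verified that mine does, and the helper cased_tokenize is just a name I made up here:

from gensim.corpora.wikicorpus import WikiCorpus

def cased_tokenize(text, token_min_len, token_max_len, lower):
    # hypothetical tokenizer: split on whitespace only, keeping the
    # original casing and any punctuation attached to the words
    return [token for token in text.split()
            if token_min_len <= len(token) <= token_max_len
            and not token.startswith('_')]

wiki = WikiCorpus("./Wiki_dump_gross.xml.bz2",
                  tokenizer_func=cased_tokenize,
                  lower=False,
                  dictionary={})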
Source: https://stackoverflow.com/questions/65564423/keep-punctuation-and-casing-in-gensim-wikicorpus-text