text cleaning is still no fun
parent 20d9eed5b3
commit a380b57bfc
test.py (2 lines changed)
@@ -456,7 +456,7 @@ maybe split the categories into subcategories
general:
fix the UTF errors, split words at special characters
-remove names
+remove names, addresses after greetings

remove e-mails, URLs, numbers
maybe even remove everything that contains one of those (or ends in a .topleveldomain, contains special characters, or contains an @)
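The e-mail/URL/number step maps fairly directly onto the older textacy.preprocess helpers that testo.py below already uses (replace_phone_numbers, fix_bad_unicode); a minimal sketch of such a cleaner, only as an illustration of the idea, not part of this commit:

import textacy

def remove_contact_data():
    # replace e-mails, URLs, phone numbers and remaining number tokens with nothing
    def clean(string):
        s = textacy.preprocess.replace_emails(string.lower(), replace_with="")
        s = textacy.preprocess.replace_urls(s, replace_with="")
        s = textacy.preprocess.replace_phone_numbers(s, replace_with="")
        s = textacy.preprocess.replace_numbers(s, replace_with="")
        return s
    return clean

# could then be appended to string_comp like the other string-level cleaners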
testo.py (85 lines changed)
@@ -187,6 +187,13 @@ def remove_short_words():

def remove_first_names():
    return lambda tok: tok.lower_ not in [name.lower() for name in VORNAMEN]

# if a word is not in the vocab, first check whether a known subword is contained in it;
# if so, check whether there is junk before or after it and split it off if necessary
# (a rough sketch of this idea follows after this hunk)

############# strings


def stringcleaning(stringstream, funclist):
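The splitting idea in the comment above could look roughly like this; vocab and the helper name are hypothetical placeholders, not part of this commit:

import re

def split_unknown_word(word, vocab):
    # hypothetical: if `word` is unknown, look for a known subword inside it
    # and drop the junk before/after it
    if word in vocab:
        return [word]
    for known in vocab:
        m = re.search(re.escape(known), word)
        if m:
            before, after = word[:m.start()], word[m.end():]
            return [p for p in (before, known, after) if p and (p in vocab or p == known)]
    return [word]

# e.g. split_unknown_word("nr54065467problem", {"problem"}) -> ["problem"]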
@@ -196,6 +203,10 @@ def stringcleaning(stringstream, funclist):
        yield string


def seperate_words_on_regex(regex=regex_specialChars):
    return lambda string: " ".join(re.compile(regex).split(string))


def remove_words_containing_topLVL():
    return lambda string: " ".join([w.lower() for w in string.split() if not re.search(regex_topLvl, w)])
@@ -212,10 +223,15 @@ def replacePhonenumbers(replace_with="PHONENUMBER"):
    return lambda string: textacy.preprocess.replace_phone_numbers(string.lower(), replace_with=replace_with)


-def replaceHardS(replace_with="ss"):
+def replaceSharpS(replace_with="ss"):
    return lambda string: re.sub(r'[ß]',replace_with,string.lower())


def replaceRockDots():
    return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))


def fixUnicode():
    return lambda string: textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')
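For reference, these replacers fold the German special characters into plain ASCII digraphs; a quick check of what they produce, assuming the definitions above:

print(replaceSharpS()("Straße"))         # -> strasse
print(replaceRockDots()("Grüße, Köln"))  # -> gruesse, koeln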
@@ -230,28 +246,36 @@ def lemmatizeWord(word,filepath=LEMMAS):

def lemmatize():
    #todo https://alpha.spacy.io/docs/usage/adding-languages#lemmatizer
-    return lambda tok: lemmatizeWord(tok.lower_)
+    return lambda doc: " ".join([lemmatizeWord(tok.lower_) for tok in doc])


-def processTextstream(textstream, string_funclist, tok_funclist,parser=DE_PARSER):
+def processTextstream(textstream, string_funclist, tok_funclist, parser=DE_PARSER, single_doc_func=None):
    """
    :param textstream: string-gen
    :param funclist: [func]
    :param parser: spacy-parser
    :return: string-gen
    """
    # first the string-level functions
    pipe = parser.pipe(stringcleaning(textstream, string_funclist))

    tokens = []
    for doc in pipe:

        tokens = [tok for tok in doc]

-        tokens = processTokens(tokens,tok_funclist,parser)
+        # then the token-level ones
+        tokens = processTokens(tokens,tok_funclist)

-        yield " ".join([tok.lower_ for tok in tokens])
+        if single_doc_func is not None:
+            yield single_doc_func(parser(" ".join([tok.lower_ for tok in tokens])))
+        else:
+            yield " ".join([tok.lower_ for tok in tokens])


-def processTokens(tokens, funclist, parser):
+def processTokens(tokens, funclist):
    # in: tokenlist, funclist
    # out: tokenlist
    for f in funclist:
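With the new single_doc_func hook the filtered tokens are joined back into a string, re-parsed, and the resulting doc is handed to the hook in one piece, so lemmatize() now sees whole documents instead of single tokens. A rough usage sketch (the sample input is made up):

cleaned = processTextstream(
    iter(["Sehr geehrte Damen und Herren, mein Uniaccount geht nicht."]),
    string_funclist=string_comp,
    tok_funclist=tok_comp,
    single_doc_func=lemmatize(),  # gets the re-parsed spaCy doc, returns a string
)
for text in cleaned:
    print(text)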
@@ -261,25 +285,33 @@ def processTokens(tokens, funclist, parser):
    return tokens




string_comp = [
-    replaceHardS(),
    fixUnicode(),
    replaceRockDots(),
    remove_words_containing_topLVL(),
    replaceSpecialChars(),
    seperate_words_on_regex()
]

tok_comp = [
-    removeENT(["PERSON"]),
+    #removeENT(["PERSON"]),
    remove_words_containing_Numbers(),
    #keepPOS(["NOUN"]),
    removePOS(["PUNCT","SPACE","NUM"]),
    removeWords(de_stop_words),

    remove_long_words(),
    remove_short_words(),

-    remove_first_names()
+    remove_first_names(),
+
+    #keepPOS(["NOUN"]),
]

single_doc_func = lemmatize()




"""
@@ -332,7 +364,7 @@ ticketcorpus = textacy.Corpus(DE_PARSER)

## add files to textacy-corpus,
printlog("add texts to textacy-corpus")
ticketcorpus.add_texts(
-    processTextstream(csv_to_contentStream(path2csv,"Description"), string_funclist=string_comp,tok_funclist=tok_comp)
+    processTextstream(csv_to_contentStream(path2csv,"Description"), string_funclist=string_comp, tok_funclist=tok_comp, single_doc_func=single_doc_func)
)

for i in range(10):
@@ -362,33 +394,6 @@ for i in range(10):




-words = [
-    "uniaccount",
-    "nr54065467",
-    "nr54065467",
-    "455a33c5,"
-    "tvt?=",
-    "tanja.saborowski@tu-dortmund.de",
-    "-",
-    "m-sw1-vl4053.itmc.tu-dortmund.de",
-    "------problem--------"
-]
-
-topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
-specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
-
-for w in words:
-    print(stringcleaning(w,string_comp))
-    #print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
-    #print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
-    #result = specialFinder.sub(" ", w)
-    #print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
-
-    #print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
-
-"""
@@ -0,0 +1,93 @@
# -*- coding: utf-8 -*-
import corenlp as corenlp
import os
import re
import textacy
import nltk
from textblob_de import TextBlobDE
from textblob_de import PatternParser


filepath = "lemmatization-de.txt"
blob = TextBlobDE(str(textacy.fileio.read_file("teststring.txt")), parser=PatternParser(pprint=True, lemmata=True))

print(blob.parse())
# first column to {lemma : id}, second column to {word : id}


"""http://www.lexiconista.com/datasets/lemmatization/"""

lemma2id = {}
word2id = {}

for id, line in enumerate(list(textacy.fileio.read_file_lines(filepath=filepath))):

    lemma = line.split()[0].strip().lower()
    if lemma not in lemma2id:
        lemma2id[lemma] = id

    word = line.split()[1].strip().lower()

    # map the inflected form to the id of its lemma
    word2id[word] = lemma2id[lemma]
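One way to sanity-check the two dicts is to invert lemma2id and look a word's lemma back up; this is only a sketch around the code above, not part of the original file:

id2lemma = {i: lemma for lemma, i in lemma2id.items()}

def lemma_of(word):
    # fall back to the word itself if it is not in the list
    return id2lemma.get(word2id.get(word.lower(), -1), word)

print(lemma_of("ging"))  # expected: "gehen", if that pair is in lemmatization-de.txt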
"""
|
||||
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
|
||||
|
||||
|
||||
def stringcleaning(stringstream, funclist):
|
||||
for string in stringstream:
|
||||
for f in funclist:
|
||||
string = f(string)
|
||||
yield string
|
||||
|
||||
|
||||
def seperate_words_on_regex(regex=regex_specialChars):
|
||||
return lambda string: " ".join(re.compile(regex).split(string))
|
||||
|
||||
|
||||
words = [
|
||||
"uniaccount",
|
||||
"nr54065467",
|
||||
"nr54065467",
|
||||
"455a33c5,"
|
||||
"tvt?=",
|
||||
"tanja.saborowski@tu-dortmund.de",
|
||||
"-",
|
||||
"m-sw1-vl4053.itmc.tu-dortmund.de",
|
||||
"------problem--------"
|
||||
]
|
||||
|
||||
|
||||
|
||||
topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
|
||||
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
|
||||
|
||||
for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):
|
||||
print(s.strip())
|
||||
|
||||
#print(stringcleaning(w,string_comp))
|
||||
#print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
|
||||
#print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
|
||||
#result = specialFinder.sub(" ", w)
|
||||
#print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
|
||||
|
||||
#print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
|
||||
"""
|