text cleaning is still no fun

parent 20d9eed5b3
commit a380b57bfc
test.py (2 changed lines)

@@ -456,7 +456,7 @@ maybe split categories into subcategories
 general:
 fix utf, split words on special characters
-remove names
+remove names, addresses after greetings
 
 remove emails, urls, numbers
 maybe even remove everything that contains any of those (or ends in a .topleveldomain, contains special characters, or contains an @)
 
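The notes in the hunk above describe the intended cleaning: fix the encoding, split on special characters, and drop words that look like names, emails, urls or numbers. A minimal, self-contained sketch of the email/url/number filtering idea; clean_line is a hypothetical helper, and only the two regex patterns are taken from this commit (they appear in testo.py below):

import re

# patterns as used in testo.py; everything else here is illustrative
regex_topLvl = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'

def clean_line(text):
    # 1) drop whitespace-separated words that look like emails, urls or numbers
    kept = [w for w in text.lower().split()
            if "@" not in w
            and not any(c.isdigit() for c in w)
            and not re.search(regex_topLvl, w)]
    # 2) split what is left on special characters
    return " ".join(p for w in kept for p in re.split(regex_specialChars, w) if p)

print(clean_line("Mail an tanja.saborowski@tu-dortmund.de wegen nr54065467"))
# -> "mail an wegen"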
testo.py (83 changed lines)

@@ -187,6 +187,13 @@ def remove_short_words():
 def remove_first_names():
     return lambda tok: tok.lower_ not in [name.lower() for name in VORNAMEN]
 
+
+
+# if a word is not in the vocab, first check whether a known subword is contained in it; if so, check whether there is junk before or after it and split if necessary
+
+
+
+
 ############# strings
 
 def stringcleaning(stringstream, funclist):
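The new comment above only sketches an idea: if a word is unknown, look for a known subword inside it and strip the junk around it. One possible reading of that note, as a self-contained sketch; split_on_known_subword and the toy vocab are assumptions, not code from this commit:

import re

# toy vocabulary standing in for whatever vocab the note refers to
vocab = {"server", "zugang"}

def split_on_known_subword(word, vocab):
    # hypothetical helper: keep known words as-is, otherwise try to recover a known subword
    if word in vocab:
        return [word]
    for known in vocab:
        m = re.search(re.escape(known), word)
        if m:
            before, after = word[:m.start()], word[m.end():]
            # drop the surrounding junk unless it is itself a known word
            return [p for p in (before, known, after) if p in vocab]
    return [word]

print(split_on_known_subword("xxserver", vocab))  # -> ['server']
print(split_on_known_subword("zugang", vocab))    # -> ['zugang']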
@@ -196,6 +203,10 @@ def stringcleaning(stringstream, funclist):
         yield string
 
 
+def seperate_words_on_regex(regex=regex_specialChars):
+    return lambda string: " ".join(re.compile(regex).split(string))
+
+
 def remove_words_containing_topLVL():
     return lambda string: " ".join([w.lower() for w in string.split() if not re.search(regex_topLvl, w)])
 
@@ -212,10 +223,15 @@ def replacePhonenumbers(replace_with="PHONENUMBER"):
     return lambda string: textacy.preprocess.replace_phone_numbers(string.lower(), replace_with=replace_with)
 
 
-def replaceHardS(replace_with="ss"):
+def replaceSharpS(replace_with="ss"):
     return lambda string: re.sub(r'[ß]', replace_with, string.lower())
 
 
+def replaceRockDots():
+    return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))
+
+
+
 def fixUnicode():
     return lambda string: textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')
 
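For reference, the two small normalizers touched in the hunk above (replaceSharpS, renamed from replaceHardS, and the new replaceRockDots) are plain regex substitutions. They are restated here as a self-contained snippet so their effect on a made-up string is visible:

import re

def replaceSharpS(replace_with="ss"):
    return lambda string: re.sub(r'[ß]', replace_with, string.lower())

def replaceRockDots():
    return lambda string: re.sub(r'[ß]', "ss", re.sub(r'[ö]', "oe", re.sub(r'[ü]', "ue", re.sub(r'[ä]', "ae", string.lower()))))

print(replaceSharpS()("Grüße"))             # -> "grüsse"
print(replaceRockDots()("Grüße aus Köln"))  # -> "gruesse aus koeln"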
@@ -230,28 +246,36 @@ def lemmatizeWord(word,filepath=LEMMAS):
 
 def lemmatize():
     #todo https://alpha.spacy.io/docs/usage/adding-languages#lemmatizer
-    return lambda tok: lemmatizeWord(tok.lower_)
+    return lambda doc: " ".join([lemmatizeWord(tok.lower_) for tok in doc])
 
 
-def processTextstream(textstream, string_funclist, tok_funclist, parser=DE_PARSER):
+def processTextstream(textstream, string_funclist, tok_funclist, parser=DE_PARSER, single_doc_func=None):
     """
     :param textstream: string-gen
     :param funclist: [func]
     :param parser: spacy-parser
     :return: string-gen
     """
+    # first apply the string methods
     pipe = parser.pipe(stringcleaning(textstream, string_funclist))
+    tokens = []
     for doc in pipe:
 
         tokens = [tok for tok in doc]
 
-        tokens = processTokens(tokens, tok_funclist, parser)
+        # then the token-level ones
+        tokens = processTokens(tokens, tok_funclist)
 
+        if single_doc_func is not None:
+            yield single_doc_func(parser(" ".join([tok.lower_ for tok in tokens])))
+        else:
             yield " ".join([tok.lower_ for tok in tokens])
 
 
-def processTokens(tokens, funclist, parser):
+def processTokens(tokens, funclist):
     # in: tokenlist, funclist
     # out: tokenlist
     for f in funclist:
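lemmatize() now returns a function that takes a whole spaCy doc and produces a lemmatized string, which is what the new single_doc_func hook in processTextstream expects. A minimal sketch of that contract, with a made-up lemma table and a fake token class standing in for the real LEMMAS file and spaCy tokens:

# hypothetical stand-ins for the real LEMMAS lookup and spaCy tokens
LEMMA_TABLE = {"grüße": "gruß", "servern": "server"}

def lemmatizeWord(word):
    return LEMMA_TABLE.get(word, word)

class FakeTok:
    def __init__(self, text):
        self.lower_ = text.lower()

def lemmatize():
    # same shape as the version in the hunk above: doc in, lemmatized string out
    return lambda doc: " ".join([lemmatizeWord(tok.lower_) for tok in doc])

doc = [FakeTok("Grüße"), FakeTok("an"), FakeTok("Servern")]
print(lemmatize()(doc))  # -> "gruß an server"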
@@ -261,25 +285,33 @@ def processTokens(tokens, funclist, parser):
     return tokens
 
 
 
 string_comp=[
-    replaceHardS(),
+    fixUnicode(),
+    replaceRockDots(),
     remove_words_containing_topLVL(),
-    replaceSpecialChars(),
+    seperate_words_on_regex()
 ]
 
 tok_comp=[
-    removeENT(["PERSON"]),
+    #removeENT(["PERSON"]),
     remove_words_containing_Numbers(),
-    #keepPOS(["NOUN"]),
     removePOS(["PUNCT","SPACE","NUM"]),
     removeWords(de_stop_words),
 
     remove_long_words(),
     remove_short_words(),
 
-    remove_first_names()
+    remove_first_names(),
 
+    #keepPOS(["NOUN"]),
 
 ]
 
+single_doc_func = lemmatize()
 
 
 
 """
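string_comp and tok_comp are just ordered lists of closures that get folded over each string or token list. A tiny self-contained illustration of that composition, mirroring stringcleaning above but with toy functions standing in for fixUnicode(), replaceRockDots() and the rest:

def stringcleaning(stringstream, funclist):
    # apply each cleaning function in order, then yield the cleaned string
    for string in stringstream:
        for f in funclist:
            string = f(string)
        yield string

# toy cleaning functions, not the ones from this commit
string_comp = [
    lambda s: s.lower(),
    lambda s: s.replace("ß", "ss"),
]

print(list(stringcleaning(["Grüße", "STRASSE"], string_comp)))
# -> ['grüsse', 'strasse']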
@@ -332,7 +364,7 @@ ticketcorpus = textacy.Corpus(DE_PARSER)
 ## add files to textacy-corpus,
 printlog("add texts to textacy-corpus")
 ticketcorpus.add_texts(
-    processTextstream(csv_to_contentStream(path2csv,"Description"), string_funclist=string_comp, tok_funclist=tok_comp)
+    processTextstream(csv_to_contentStream(path2csv,"Description"), string_funclist=string_comp, tok_funclist=tok_comp, single_doc_func=single_doc_func)
 )
 
 for i in range(10):
@@ -362,33 +394,6 @@ for i in range(10):
 
 
 
-words = [
-    "uniaccount",
-    "nr54065467",
-    "nr54065467",
-    "455a33c5,"
-    "tvt?=",
-    "tanja.saborowski@tu-dortmund.de",
-    "-",
-    "m-sw1-vl4053.itmc.tu-dortmund.de",
-    "------problem--------"
-]
-
-topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
-specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
-
-for w in words:
-    print(stringcleaning(w,string_comp))
-    #print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
-    #print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
-    #result = specialFinder.sub(" ", w)
-    #print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
-
-    #print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
 
 """
new file (93 added lines)

@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+import corenlp as corenlp
+import os
+import re
+import textacy
+import nltk
+from textblob_de import TextBlobDE
+from textblob_de import PatternParser
+
+filepath = "lemmatization-de.txt"
+
+
+
+blob = TextBlobDE(str(textacy.fileio.read_file("teststring.txt")), parser=PatternParser(pprint=True, lemmata=True))
+
+print(blob.parse())
+
+
+
+# first column to {lemma : id}, second column to {word : id}
+
+
+"""http://www.lexiconista.com/datasets/lemmatization/"""
+
+lemma2id = {}
+word2id = {}
+
+for id, line in enumerate(list(textacy.fileio.read_file_lines(filepath=filepath))):
+
+    lemma = line.split()[0].strip().lower()
+    if lemma not in lemma2id:
+        lemma2id[lemma] = id
+
+    word = line.split()[1].strip().lower()
+
+    word2id[word] = lemma2id[lemma]
+
+
+
+
+
+"""
+regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
+
+
+def stringcleaning(stringstream, funclist):
+    for string in stringstream:
+        for f in funclist:
+            string = f(string)
+        yield string
+
+
+def seperate_words_on_regex(regex=regex_specialChars):
+    return lambda string: " ".join(re.compile(regex).split(string))
+
+
+words = [
+    "uniaccount",
+    "nr54065467",
+    "nr54065467",
+    "455a33c5,"
+    "tvt?=",
+    "tanja.saborowski@tu-dortmund.de",
+    "-",
+    "m-sw1-vl4053.itmc.tu-dortmund.de",
+    "------problem--------"
+]
+
+
+topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
+specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
+
+for s in stringcleaning((w for w in words), [seperate_words_on_regex()]):
+    print(s.strip())
+
+    #print(stringcleaning(w,string_comp))
+    #print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
+    #print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
+    #result = specialFinder.sub(" ", w)
+    #print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
+
+    #print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
+"""