Commit 0a6a68b8aa: "lauffähige version" (runnable version)
Parent commit: ecc8c0c54a

backup.py · 383 changed lines (file deleted)
@@ -1,383 +0,0 @@
# -*- coding: utf-8 -*-

############# misc

def printlog(string, level="INFO"):
    """log and prints"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)

printlog("Load functions")

def compose(*functions):
    def compose2(f, g):
        return lambda x: f(g(x))
    return functools.reduce(compose2, functions, lambda x: x)

def get_calling_function():
    """finds the calling function in many decent cases.
    https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name
    """
    fr = sys._getframe(1)  # inspect.stack()[1][0]
    co = fr.f_code
    for get in (
            lambda: fr.f_globals[co.co_name],
            lambda: getattr(fr.f_locals['self'], co.co_name),
            lambda: getattr(fr.f_locals['cls'], co.co_name),
            lambda: fr.f_back.f_locals[co.co_name],  # nested
            lambda: fr.f_back.f_locals['func'],  # decorators
            lambda: fr.f_back.f_locals['meth'],
            lambda: fr.f_back.f_locals['f'],
    ):
        try:
            func = get()
        except (KeyError, AttributeError):
            pass
        else:
            if func.__code__ == co:
                return func
    raise AttributeError("func not found")

def printRandomDoc(textacyCorpus):
    import random
    print()
    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    printlog("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
    print()

############# load xml

def generateMainTextfromTicketXML(path2xml, main_textfield='Description'):
    """
    generates strings from XML
    :param path2xml:
    :param main_textfield:
    :param cleaning_function:
    :yields strings
    """
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()
    for ticket in root:
        for field in ticket:
            if field.tag == main_textfield:
                yield field.text

def generateMetadatafromTicketXML(path2xml, leave_out=['Description']):
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()
    for ticket in root:
        metadata = {}
        for field in ticket:
            if field.tag not in leave_out:
                metadata[field.tag] = field.text
        yield metadata

############# load csv

def csv_to_contentStream(path2csv: str, content_collumn_name: str):
    """
    :param path2csv: string
    :param content_collumn_name: string
    :return: string-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')
    content_collumn = 0  # default value
    for i, lst in enumerate(stream):
        if i == 0:
            # look for desired column
            for j, col in enumerate(lst):
                if col == content_collumn_name:
                    content_collumn = j
        else:
            yield lst[content_collumn]

def csv_to_metaStream(path2csv: str, metalist: [str]):
    """
    :param path2csv: string
    :param metalist: list of strings
    :return: dict-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')
    content_collumn = 0  # default value
    metaindices = []
    metadata_temp = {}
    for i, lst in enumerate(stream):
        if i == 0:
            for j, col in enumerate(lst):  # could surely be done more efficiently; fine, since this only happens once
                for key in metalist:
                    if key == col:
                        metaindices.append(j)
            metadata_temp = dict(zip(metalist, metaindices))  # e.g. {'Subject': 1, 'categoryName': 3, 'Solution': 10}
        else:
            metadata = metadata_temp.copy()
            for key, value in metadata.items():
                metadata[key] = lst[value]
            yield metadata

############################################ Preprocessing ##############################################

############# on str-gen

def processTokens(tokens, funclist, parser):
    # in: tokenlist, funclist
    # out: tokenlist
    for f in funclist:
        # idea: sort funclist so that all string methods run first, then parse, then work on tokens, then possibly on the whole Doc
        if 'bool' in str(f.__annotations__):
            tokens = list(filter(f, tokens))
        elif 'str' in str(f.__annotations__):
            tokens = list(map(f, tokens))  # plain text
            doc = parser(" ".join(tokens))  # re-parse
            tokens = [tok for tok in doc]  # tokens only
        elif 'spacy.tokens.doc.Doc' in str(f.__annotations__):
            # todo feels hacked together
            doc = parser(" ".join(tok.lower_ for tok in tokens))  # parsed
            tokens = f(doc)
            doc = parser(" ".join(tokens))  # parsed
            tokens = [tok for tok in doc]  # tokens only
        else:
            warnings.warn("Unknown Annotation while preprocessing. Function: {0}".format(str(f)))
    return tokens

def processTextstream(textstream, funclist, parser=DE_PARSER):
    """
    :param textstream: string-gen
    :param funclist: [func]
    :param parser: spacy-parser
    :return: string-gen
    """
    # input: str-stream   output: str-stream
    pipe = parser.pipe(textstream)
    for doc in pipe:
        tokens = []
        for tok in doc:
            tokens.append(tok)
        tokens = processTokens(tokens, funclist, parser)
        yield " ".join([tok.lower_ for tok in tokens])

def processDictstream(dictstream, funcdict, parser=DE_PARSER):
    """
    :param dictstream: dict-gen
    :param funcdict:
            clean_in_meta = {
                "Solution": funclist,
                ...
            }
    :param parser: spacy-parser
    :return: dict-gen
    """
    for dic in dictstream:
        result = {}
        for key, value in dic.items():
            if key in funcdict:
                doc = parser(value)
                tokens = [tok for tok in doc]
                funclist = funcdict[key]
                tokens = processTokens(tokens, funclist, parser)
                result[key] = " ".join([tok.lower_ for tok in tokens])
            else:
                result[key] = value
        yield result

############# return bool

def keepPOS(pos_list) -> bool:
    ret = lambda tok: tok.pos_ in pos_list
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def removePOS(pos_list) -> bool:
    ret = lambda tok: tok.pos_ not in pos_list
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def removeWords(words, keep=None) -> bool:
    if hasattr(keep, '__iter__'):
        for k in keep:
            try:
                words.remove(k)
            except ValueError:
                pass
    ret = lambda tok: tok.lower_ not in words
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def keepENT(ent_list) -> bool:
    ret = lambda tok: tok.ent_type_ in ent_list
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def removeENT(ent_list) -> bool:
    ret = lambda tok: tok.ent_type_ not in ent_list
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def remove_words_containing_Numbers() -> bool:
    ret = lambda tok: not bool(re.search('\d', tok.lower_))
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def remove_words_containing_specialCharacters() -> bool:
    ret = lambda tok: not bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', tok.lower_))
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def remove_words_containing_topLVL() -> bool:
    ret = lambda tok: not bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', tok.lower_))
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def lemmatizeWord(word, filepath=LEMMAS):
    """http://www.lexiconista.com/datasets/lemmatization/"""
    for line in list(textacy.fileio.read_file_lines(filepath=filepath)):
        if word.lower() == line.split()[1].strip().lower():
            return line.split()[0].strip().lower()
    return word.lower()  # in case nothing was found

def lemmatize() -> str:
    ret = lambda tok: lemmatizeWord(tok.lower_)
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

############# return strings

mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
hardSFinder = re.compile(r'[ß]', re.IGNORECASE)

def replaceEmails(replace_with="EMAIL") -> str:
    ret = lambda tok: emailFinder.sub(replace_with, tok.lower_)
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def replaceURLs(replace_with="URL") -> str:
    ret = lambda tok: textacy.preprocess.replace_urls(tok.lower_, replace_with=replace_with)
    # ret = lambda tok: urlFinder.sub(replace_with, tok.lower_)
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def replaceSpecialChars(replace_with=" ") -> str:
    ret = lambda tok: specialFinder.sub(replace_with, tok.lower_)
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def replaceTwitterMentions(replace_with="TWITTER_MENTION") -> str:
    ret = lambda tok: mentionFinder.sub(replace_with, tok.lower_)
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def replaceNumbers(replace_with="NUMBER") -> str:
    ret = lambda tok: textacy.preprocess.replace_numbers(tok.lower_, replace_with=replace_with)
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def replacePhonenumbers(replace_with="PHONENUMBER") -> str:
    ret = lambda tok: textacy.preprocess.replace_phone_numbers(tok.lower_, replace_with=replace_with)
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def replaceHardS(replace_with="ss") -> str:
    ret = lambda tok: hardSFinder.sub(replace_with, tok.lower_)
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def fixUnicode() -> str:
    ret = lambda tok: textacy.preprocess.fix_bad_unicode(tok.lower_, normalization=u'NFC')
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def resolveAbbreviations():
    pass  # todo

# todo: drop words with len < 2 (after resolving abbreviations, above all "tu" and "fh") and len > 35 or 50 ("Reiserücktrittskostenversicherung")

############# return docs

def keepUniqeTokens() -> spacy.tokens.Doc:
    ret = lambda doc: (set([tok.lower_ for tok in doc]))
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def lower() -> spacy.tokens.Doc:
    ret = lambda doc: ([tok.lower_ for tok in doc])
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

################################################################################################################
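
The deleted pipeline above dispatches on the return annotation that each factory copies onto the lambda it returns: bool-annotated functions act as token filters, str-annotated ones are mapped over the tokens and re-parsed. The following self-contained sketch is not part of the commit; it reads its own __annotations__ directly instead of going through get_calling_function and uses plain strings instead of spaCy tokens, so it runs without backup.py's missing imports.

# Illustrative sketch, not from the repository: the annotation-based dispatch of processTokens in miniature.
def remove_words(words) -> bool:
    ret = lambda tok: tok.lower() not in words
    ret.__annotations__ = remove_words.__annotations__  # the factory tags its lambda, as keepPOS/removeWords do above
    return ret

def to_upper() -> str:
    ret = lambda tok: tok.upper()
    ret.__annotations__ = to_upper.__annotations__
    return ret

def process_tokens(tokens, funclist):
    for f in funclist:
        if 'bool' in str(f.__annotations__):
            tokens = list(filter(f, tokens))   # bool annotation -> predicate
        elif 'str' in str(f.__annotations__):
            tokens = list(map(f, tokens))      # str annotation -> mapper
    return tokens

print(process_tokens("bitte passwort zuruecksetzen danke".split(),
                     [remove_words({"bitte", "danke"}), to_upper()]))
# -> ['PASSWORT', 'ZURUECKSETZEN']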

cleaning.py · 36 changed lines
@@ -11,6 +11,9 @@ from scipy import *

import os

from preprocessing import removePOS
from preprocessing import filterTokens

csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

@@ -24,11 +27,6 @@ with open(config_ini) as f:
    config.read_file(f)

global REGEX_SPECIALCHAR
global WORDS

REGEX_SPECIALCHAR = r'[`\=~%^&*()_+\[\]{};\'"|</>]' #+r',.-\\:' #+r',.?!'
WORDS = {}

@@ -113,15 +111,12 @@ def clean(stringstream,autocorrect=False):
    string = re.sub(r'[ü]', "ue", string)
    string = re.sub(r'[ä]', "ae", string)

    # question: autocorrect?
    # idea: http://lexitron.nectec.or.th/public/COLING-2010_Beijing_China/POSTERS/pdf/POSTERS022.pdf
    # question: autocorrect? idea: http://lexitron.nectec.or.th/public/COLING-2010_Beijing_China/POSTERS/pdf/POSTERS022.pdf
    if autocorrect:
        string = " ".join([autocorrectWord(word) for word in string.split()])

    yield string


def processDictstream(dictstream, funcdict, parser):
    """

@@ -154,30 +149,21 @@ def processDictstream(dictstream, funcdict, parser):
                result[key] = value
        yield result

def filterTokens(tokens, funclist):
    # in: tokenlist, funclist
    # out: tokenlist
    for f in funclist:
        tokens = list(filter(f, tokens))
    return tokens

def removePOS(pos_list):
    return lambda tok: tok.pos_ not in pos_list

##################################################################################################

ressources_path = FILEPATH + "ressources/"

path2wordsdict = FILEPATH + config.get("spellchecking", "pickle_file")
path2wordsdict = ressources_path + config.get("spellchecking", "pickle_file")

corpus_de_path = FILEPATH + config.get("de_corpus", "path")
corpus_en_path = FILEPATH + config.get("en_corpus", "path")

autocorrect = config.getboolean("preprocessing", "autocorrect")

def cleanCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10):
def cleanCorpus(corpus_path, clean_in_meta, lang="de", printrandom=10, autocorrect=False):

    logprint("Clean {0}_corpus at {1}".format(lang, datetime.now()))

@@ -192,7 +178,7 @@ def cleanCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrando
    ## process and add files to textacy-corpi,
    clean_corpus.add_texts(
        clean(corpus2Text(raw_corpus)),
        clean(corpus2Text(raw_corpus), autocorrect=autocorrect),
        processDictstream(corpus2Meta(raw_corpus), clean_in_meta, parser=parser)
    )

@@ -220,8 +206,6 @@ def main():
    WORDS = load_obj(path2wordsdict)

    clean_in_content = []  # question: necessary?

    clean_in_meta = {
        "Solution": [removePOS(["SPACE"])],

@@ -229,7 +213,7 @@ def main():
        "categoryName": [removePOS(["SPACE", "PUNCT"])]
    }

    corpus = cleanCorpus(corpus_de_path, clean_in_content, clean_in_meta, "de", printrandom=5)
    corpus = cleanCorpus(corpus_de_path, clean_in_meta, "de", printrandom=5, autocorrect=autocorrect)

    end = time.time()
    logprint("Time Elapsed Cleaning:{0} min".format((end - start) / 60))
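
For reference, the clean() generator touched above reduces to a per-string normalisation step: umlauts are flattened to ASCII digraphs and, with the new autocorrect flag, every word is run through the pickle-backed spell checker. A minimal standalone sketch of that step follows; autocorrectWord is the repository's own function and is only referenced here, not defined, and the ö/ß substitutions are assumptions about the parts of clean() not visible in this hunk.

# Illustrative sketch, not from the repository.
import re

def normalize(string, autocorrect=False):
    string = re.sub(r'[ü]', "ue", string)
    string = re.sub(r'[ä]', "ae", string)
    string = re.sub(r'[ö]', "oe", string)   # assumption: clean() also flattens ö
    string = re.sub(r'[ß]', "ss", string)   # assumption: clean() also flattens ß
    if autocorrect:
        # autocorrectWord comes from cleaning.py's spellchecking setup (not defined here)
        string = " ".join(autocorrectWord(word) for word in string.split())
    return string

print(normalize("bitte überprüfen sie die änderung"))   # -> "bitte ueberpruefen sie die aenderung"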

(deleted file, filename not shown in this view)
@@ -1,24 +0,0 @@
Index: 0
Text: lieber support, ich habe gerade versucht mich mit meiner unicard im firefox browser fuer das service portal zu authentifizieren. das hat vor einigen wochen noch tadelos geklappt und mittlerweile bekomme ich folgende fehlermeldung ich hoffe sie koennen mir weiterhelfen. vielen dank und viele gruesse sascha feldhorst dipl. inform. sascha feldhorst wiss. ang. technische universitaet dortmund maschinenbau lehrstuhl fuer foerder und lagerwesen logistikcampus joseph von fraunhofer str. 2 4 d 44227 dortmund tel. 49 231 755 40 73 fax 49 231 755 47 68 mailto sascha.feldhorst@tu dortmund.de sascha.feldhorst@tu dortmund.de http www.flw.mb.tu dortmund.de www.flw.mb.tu dortmund.de wichtiger hinweis die information in dieser e mail ist vertraulich. sie ist ausschliesslich fuer den adressaten bestimmt. sollten sie nicht der fuer diese e mail bestimmte adressat sein, unterrichten sie bitte den absender und vernichten sie diese mail. vielen dank. unbeschadet der korrespondenz per e mail, sind unsere erklaerungen ausschliesslich final rechtsverbindlich, wenn sie in herkoemmlicher schriftform mit eigenhaendiger unterschrift oder durch uebermittlung eines solchen schriftstuecks per telefax erfolgen. important note the information included in this e mail is confidential. it is solely intended for the recipient. if you are not the intended recipient of this e mail please contact the sender and delete this message. thank you. without prejudice of e mail correspondence, our statements are only legally binding when they are made in the conventional written form with personal signature or when such documents are sent by fax.
categoryName: betrieb
Index: 0
Text: support browser service portal mittlerweile
categoryName: betrieb
Index: 1
Text: telefon umzug antragsteller astrid gramm astrid.gramm@tu dortmund.de terminvorschlag 14.08.2015 einrichtung dezernat 2 abteilung 2.5 psp element uniaccount mnichofm hofmann, nicole gebaeude dezernat 5 raum id 201 651430 telefondose neztwerkdose dt04.5 04.6 telefonnr. 4821 eintrag telefonbuch e mail astrid.gramm@tu dortmund.de voicemail ansprechpartner astrid gramm tel. ansprechpartner 5444 verantwortlicher nutzer type bemerkung frau hofmann wird am 14.08.2015 in die wd 2 umziehen. es ist der raum 201a im og nicht 201 eine bezeichnung der telefondose ist nicht vorhanden.
categoryName: elektronisches telefonbuch
Index: 1
Text: telefon umzug antragsteller gramm einrichtung dezernat abteilung element gebaeude dezernat raum id eintrag telefonbuch mail ansprechpartner gramm ansprechpartner verantwortlicher nutzer type bemerkung raum bezeichnung
categoryName: elektronisches telefonbuch

config.ini · 11 changed lines
@@ -37,12 +37,12 @@ pickle_file=en_stopwords_list.pkl
[logging]
level=INFO
filename=topicModelTickets.log
filename=log/topicModelTickets.log
[de_corpus]
input=M42-Export/Tickets_small.csv
#input=M42-Export/de_tickets.csv
#input=M42-Export/Tickets_small.csv
input=M42-Export/de_tickets.csv
path=corpi/

@@ -64,7 +64,10 @@ metaliste=TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerI
#ents2keep=WORK_OF_ART,ORG,PRODUCT,LOC
custom_words=eintrag,element,nutzer,einrichtung,abteilung,gebaeude,raum,ansprechpartner,geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok
autocorrect = false
#true
custom_words=aenderung,hahn,verantwortlicher,rolle,status,fehlgeschlagen,aenderung,test,erwuenscht,antragsteller,bemerkung,tu,uni,prof,bezeichnung,gramm,type,eintrag,element,nutzer,einrichtung,abteilung,gebaeude,raum,ansprechpartner,geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok,januar,februar,maerz,april,mai,juni,juli,august,september,oktober,november,dezember
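
The keys changed here are read via configparser elsewhere in this commit: the cleaning.py diff above reads autocorrect from a [preprocessing] section, and the loader code below reads [de_corpus]. A short sketch of how these values are consumed; the path "config.ini" and the printed values are illustrative.

# Illustrative sketch, not from the repository.
from configparser import ConfigParser

config = ConfigParser()
config.read("config.ini", encoding="utf-8")

logfile     = config.get("logging", "filename")                  # now "log/topicModelTickets.log"
ticket_csv  = config.get("de_corpus", "input")                   # now "M42-Export/de_tickets.csv"
autocorrect = config.getboolean("preprocessing", "autocorrect")  # "false" -> False
print(logfile, ticket_csv, autocorrect)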

(modified file, filename not shown in this view)
@@ -23,8 +23,6 @@ with open(config_ini) as f:

def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):
    """
    :param path2csv: string

@@ -75,27 +73,9 @@ def ticket_csv_to_DictStream(path2csv: str, metalist: [str]):
##################################################################################################

# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corporization.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_corporization.log &"

"""
content_collumn_name = "Description"
metaliste = [
    "TicketNumber",
    "Subject",
    "CreatedDate",
    "categoryName",
    "Impact",
    "Urgency",
    "BenutzerID",
    "VerantwortlicherID",
    "EigentuemerID",
    "Solution"
]
"""

content_collumn_name = config.get("tickets","content_collumn_name")
metaliste = list(map(normalize_whitespace,config.get("tickets","metaliste").split(",")))
metaliste = get_list_from_config("tickets","metaliste")

path2de_csv = FILEPATH + config.get("de_corpus","input")

@@ -110,7 +90,6 @@ corpus_en_path = FILEPATH + config.get("en_corpus", "path")
def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, lang, printrandom=0):

    # print paths
    path_csv_split = path2_csv.split("/")
    filename = path_csv_split[len(path_csv_split) - 1]

@@ -121,8 +100,6 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la
    raw_corpus = textacy.Corpus(lang)

    ## add files to textacy-corpi,
    # printlog("Add texts to {0}_textacy-corpi".format(lang))
    raw_corpus.add_texts(
        ticketcsv_to_textStream(path2_csv, content_collumn_name),
        ticket_csv_to_DictStream(path2_csv, metaliste)

@@ -132,6 +109,7 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la
    # kick empty docs out of the corpus
    raw_corpus.remove(lambda doc: len(doc) == 0)

    logprint("corpus-lenght: {}".format(len(raw_corpus)))
    # print a random doc
    for i in range(printrandom):
        printRandomDoc(raw_corpus)
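
Condensed, the corpus build in this file hands the two CSV streams to a textacy corpus and then drops empty documents. The sketch below shows that flow using the functions from the hunks above; the module name in the import and the metadata field list are assumptions, and the textacy calls mirror the old 0.x API used by this repository rather than the current one.

# Illustrative sketch, not from the repository.
import textacy
# assumption: this diff belongs to corporization.py (suggested by the ssh command above)
from corporization import ticketcsv_to_textStream, ticket_csv_to_DictStream

path2csv = "M42-Export/de_tickets.csv"
raw_corpus = textacy.Corpus("de")
raw_corpus.add_texts(
    ticketcsv_to_textStream(path2csv, "Description"),
    ticket_csv_to_DictStream(path2csv, ["TicketNumber", "Subject", "categoryName", "Solution"]),
)
raw_corpus.remove(lambda doc: len(doc) == 0)   # drop empty docs, as in the hunk above
print("corpus length:", len(raw_corpus))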

german_stopwords.txt · 1855 changed lines
File diff suppressed because it is too large

init.py · 33 changed lines
@@ -237,36 +237,37 @@ def build_words_for_spellchecking(path2words):
##################################################################################################

# THESAURUS
path2wordnet = FILEPATH + config.get("thesaurus","input")
path2thesaurus_dict = FILEPATH + config.get("thesaurus","pickle_file")
ressources_path = FILEPATH + "ressources/"
path2wordnet = ressources_path + config.get("thesaurus","input")
path2thesaurus_dict = ressources_path + config.get("thesaurus","pickle_file")

# SPELLCHECKING
path2words_file = FILEPATH + config.get("spellchecking","input")
path2wordlist = FILEPATH + config.get("spellchecking","pickle_file")
path2words_file = ressources_path + config.get("spellchecking","input")
path2wordlist = ressources_path + config.get("spellchecking","pickle_file")

# LEMMA
path2lemma_file = FILEPATH + config.get("lemmatization","input")
path2lemmadict = FILEPATH + config.get("lemmatization","pickle_file")
path2lemma_file = ressources_path + config.get("lemmatization","input")
path2lemmadict = ressources_path + config.get("lemmatization","pickle_file")

# NOUNS
nouns1 = FILEPATH + config.get("nouns","input1")
nouns2 = FILEPATH + config.get("nouns","input2")
path2nouns_list = FILEPATH + config.get("nouns","pickle_file")
nouns1 = ressources_path + config.get("nouns","input1")
nouns2 = ressources_path + config.get("nouns","input2")
path2nouns_list = ressources_path + config.get("nouns","pickle_file")

# FIRST NAMES
firstnames_txt = FILEPATH + config.get("firstnames","input")
path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file")
firstnames_txt = ressources_path + config.get("firstnames","input")
path2firstnameslist = ressources_path + config.get("firstnames","pickle_file")

# STOPWORDS
stop1 = FILEPATH + config.get("de_stopwords","input1")
stop2 = FILEPATH + config.get("de_stopwords","input2")
stop3 = FILEPATH + config.get("de_stopwords","input3")
path2stopwordlist_de = FILEPATH + config.get("de_stopwords","pickle_file")
stop1 = ressources_path + config.get("de_stopwords","input1")
stop2 = ressources_path + config.get("de_stopwords","input2")
stop3 = ressources_path + config.get("de_stopwords","input3")
path2stopwordlist_de = ressources_path + config.get("de_stopwords","pickle_file")

path2stopwordlist_en = FILEPATH + config.get("en_stopwords","pickle_file")
path2stopwordlist_en = ressources_path + config.get("en_stopwords","pickle_file")

Binary files not shown (6 files).

lemmatization-de.txt · 358474 changed lines
File diff suppressed because it is too large

lexicalentries.xml · 654747 changed lines
File diff suppressed because it is too large

main.py · 21 changed lines
@@ -11,12 +11,12 @@ import cleaning
from miscellaneous import *

# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &"
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/log/printout_main.log &"
start = time.time()

#init.main()
init.main()
logprint("")

corporization.main()

@@ -30,32 +30,23 @@ logprint("")

"""
topicModeling.main(use_raw=False,algorithm="lsa")
#topicModeling.main(use_cleaned=False,algorithm="lsa")
logprint("")

topicModeling.main(use_raw=False,algorithm="lda")
#topicModeling.main(use_cleaned=False,algorithm="nmf")
logprint("")

topicModeling.main(use_raw=False,algorithm="nmf")
#topicModeling.main(use_cleaned=False,algorithm="lda")
logprint("")

topicModeling.main(use_raw=False,algorithm="llda")
topicModeling.main(use_cleaned=False,algorithm="llda")
logprint("")
"""

logprint("")

end = time.time()
logprint("Total Time Elapsed: {0} min".format((end - start) / 60))

miscellaneous.py · 112 changed lines
@@ -153,6 +153,25 @@ def printRandomDoc(textacyCorpus):
    print()

def get_list_from_config(section,option):
    return list(map(textacy.preprocess.normalize_whitespace,config.get(section,option).split(",")))

def corpus2Text(corpus):
    for doc in corpus:
        yield doc.text

def corpus2Meta(corpus):
    for doc in corpus:
        yield doc.metadata

def savelabledCorpiLines(corpus,filepath):
    textacy.fileio.write_file_lines(gen_labledLines(corpus), filepath=filepath)

def gen_labledLines(corpus):
    for doc in corpus:
        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi
        yield "[" + doc.metadata["categoryName"] + "] " + doc.text

def save_corpus(corpus, corpus_path, corpus_name):

@@ -219,95 +238,6 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
        for key,value in plain.items():
            if key != "content" and key != "index":
                meta[key] = value
        corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=meta))

    return corpus, corpus.spacy_lang

"""
def corpus2Text(corpus):
    for doc in corpus:
        yield doc.text

def corpus2Meta(corpus):
    for doc in corpus:
        yield doc.metadata

def saveplaincorpustext(corpus,path):
    textacy.fileio.write_file_lines(corpus2Text(corpus),filepath=path )

def save_corpusV2(corpus, corpus_path, corpus_name):

    # save parser
    parser = corpus.spacy_lang
    parserpath = corpus_path + str(parser.lang) + '_parser'
    parser.save_to_directory(parserpath)

    contentpath = corpus_path + corpus_name + "_docs/"
    if not os.path.exists(contentpath):
        os.makedirs(contentpath)

    for doc in corpus:
        with open(contentpath + str(doc.corpus_index) + "_doc.bin", 'w') as f:
            f.write(doc.spacy_doc.to_bytes())
        with open(contentpath + str(doc.corpus_index) + "_meta.json", 'w') as file:
            file.write(json.dumps(doc.metadata))

def load_corpusV2(corpus_path, corpus_name, lang="de"):

    # check for language
    if "de_" in corpus_name:
        lang = "de"
    elif "en_" in corpus_name:
        lang = "en"

    # load parser
    parser = spacy.load(lang)

    stringstorepath = corpus_path + str(lang) + '_parser' + '/vocab/strings.json'
    with open(stringstorepath) as file:
        parser.vocab.strings.load(file)

    vocabpath = Path(corpus_path + str(lang) + '_parser' + '/vocab/lexemes.bin')
    parser.vocab.load_lexemes(vocabpath)

    # load corpus
    corpus = textacy.Corpus(parser)

    contentpath = corpus_path + corpus_name + "_docs/"
    docs = yield_fromdir(contentpath,spacy_vocab=corpus.spacy_vocab,type="doc")
    metas = yield_fromdir(contentpath,type="meta")

    for doc,meta in zip(docs,metas):
        corpus.add_doc(
            textacy.Doc(doc, lang=corpus.spacy_lang, metadata=meta))

    return corpus, corpus.spacy_lang

def yield_fromdir(path,spacy_vocab=None,type=".pkl"):
    os.chdir(path)
    filelist = [name for name in os.listdir('.') if os.path.isfile(name)]
    filelist = [filename for filename in filelist if type in filename]
    filelist.sort(key = lambda elem : elem.split("_")[0])

    if type =='doc':
        for filename in filelist:
            with open(path+filename,'r') as f:
                for bytes_string in SpacyDoc.read_bytes(f):
                    yield SpacyDoc(spacy_vocab).from_bytes(bytes_string)
    elif type == 'meta':
        for filename in filelist:
            with open(path+filename,'r') as f:
                yield json.load(f)
    else:
        for filename in filelist:
            yield load_obj(path+filename)
"""
        corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=meta))

    return corpus, corpus.spacy_lang
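
The new helpers added to miscellaneous.py are thin glue: get_list_from_config splits and whitespace-normalises a comma-separated config value, corpus2Text/corpus2Meta stream a saved corpus, and savelabledCorpiLines writes one "[categoryName] text" line per document. A usage sketch; the corpus name passed to load_corpus and the output path are illustrative, not taken from the commit.

# Illustrative sketch, not from the repository.
from miscellaneous import (get_list_from_config, corpus2Text,
                           savelabledCorpiLines, load_corpus)

metaliste = get_list_from_config("tickets", "metaliste")          # e.g. ["TicketNumber", "Subject", ...]
corpus, parser = load_corpus(corpus_path="corpi/", corpus_name="de_clean")  # corpus_name is an assumption

print(next(corpus2Text(corpus))[:80])                             # first document's text
savelabledCorpiLines(corpus, "corpi/labled_lines.txt")            # "[categoryName] tok1 tok2 ..." per doc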

(deleted file, filename not shown in this view)
@@ -1,466 +0,0 @@
# -*- coding: utf-8 -*-
import csv
import random
import sys

import spacy
import textacy

"""
import keras
import numpy as np
from keras.layers import Dense, SimpleRNN, LSTM, TimeDistributed, Dropout
from keras.models import Sequential
import keras.backend as K
"""
csv.field_size_limit(sys.maxsize)

"""
def getFirstSynonym(word, thesaurus_gen):

    word = word.lower()
    # TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python

    # iterate over the thesaurus
    for syn_block in thesaurus_gen:  # syn_block is a list of synonyms

        # iterate over the synonym block
        for syn in syn_block:
            syn = syn.lower().split(" ") if not re.match(r'\A[\w-]+\Z', syn) else syn  # turn the synonym into a list (to spot multi-word entries)

            # if the word is contained in the synonym (i.e. equals a word in the list)
            if word in syn:

                # look for the Hauptform (main form)
                if "auptform" in syn:
                    # do not return it if it is in parentheses
                    for w in syn:
                        if not re.match(r'\([^)]+\)', w) and w is not None:
                            return w

                # if no Hauptform is present, return the first synonym that is not a sentence and not in parentheses
                if len(syn) == 1:
                    w = syn[0]
                    if not re.match(r'\([^)]+\)', w) and w is not None:
                        return w

    return word  # as a last resort, return the input
"""

"""
def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False, normalize_synonyms=False):

    # use preprocessing
    if customPreprocessing is not None:
        string = customPreprocessing(string)

    if custom_stopwords is not None:
        custom_stopwords = custom_stopwords
    else:
        custom_stopwords = []

    if custom_words is not None:
        custom_words = custom_words
    else:
        custom_words = []

    if custom_symbols is not None:
        custom_symbols = custom_symbols
    else:
        custom_symbols = []

    # custom stoplist
    # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
    stop_words = __import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS

    stoplist = list(stop_words) + custom_stopwords
    # List of symbols we don't care about either
    symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols

    # get rid of newlines
    string = string.strip().replace("\n", " ").replace("\r", " ")

    # replace twitter
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    string = mentionFinder.sub("MENTION", string)

    # replace emails
    emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
    string = emailFinder.sub("EMAIL", string)

    # replace urls
    urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
    string = urlFinder.sub("URL", string)

    # replace HTML symbols
    string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")

    # parse with spaCy
    spacy_doc = PARSER(string)
    tokens = []

    added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"]
    added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"]  # IDEA: also take NUM into the corpus, but for topic modeling nouns only http://aclweb.org/anthology/U15-1013

    # append Tokens to a list
    for tok in spacy_doc:
        if tok.pos_ in added_POS:
            if lemmatize:
                tokens.append(tok.lemma_.lower().strip())
            else:
                tokens.append(tok.text.lower().strip())

        # add entities
        if tok.ent_type_ in added_entities:
            tokens.append(tok.text.lower())

    # remove stopwords
    tokens = [tok for tok in tokens if tok not in stoplist]

    # remove symbols
    tokens = [tok for tok in tokens if tok not in symbols]

    # remove custom_words
    tokens = [tok for tok in tokens if tok not in custom_words]

    # remove single characters
    tokens = [tok for tok in tokens if len(tok)>1]

    # remove large strings of whitespace
    remove_large_strings_of_whitespace(" ".join(tokens))

    # idea: resolve abbreviations (above all TU -> Technische Universität)

    if normalize_synonyms:
        tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens]

    return " ".join(tokens)


def remove_large_strings_of_whitespace(sentence):

    whitespaceFinder = re.compile(r'(\r\n|\r|\n)', re.IGNORECASE)
    sentence = whitespaceFinder.sub(" ", sentence)

    tokenlist = sentence.split(" ")

    while "" in tokenlist:
        tokenlist.remove("")
    while " " in tokenlist:
        tokenlist.remove(" ")

    return " ".join(tokenlist)
"""

"""
def generateFromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False,lemmatize=False):
    import xml.etree.ElementTree as ET

    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root:
        metadata = {}
        text = "ERROR"
        for field in ticket:
            if field.tag == textfield:
                if clean:
                    text = cleanText_words(field.text,PARSER,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize)
                else:
                    text = field.text
            else:
                # idea: clean here as well?
                metadata[field.tag] = field.text
        yield text, metadata
"""

LANGUAGE = 'de'
#PARSER = de_core_news_md.load()
PARSER = spacy.load(LANGUAGE)

from old.textCleaning import TextCleaner

cleaner = TextCleaner(parser=PARSER)


def generateTextfromTicketXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False, lemmatize=False):
    import xml.etree.ElementTree as ET

    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root:
        text = "ERROR"
        for field in ticket:
            if field.tag == textfield:
                if clean:
                    text = cleaner.normalizeSynonyms(cleaner.removeWords(cleaner.keepPOSandENT(field.text)))  #,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize)
                else:
                    text = field.text
        yield text


def generateMetadatafromTicketXML(path2xml, textfield='Beschreibung'):  #,keys_to_clean=["Loesung","Zusammenfassung"]):
    import xml.etree.ElementTree as ET

    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root:
        metadata = {}
        for field in ticket:
            if field.tag != textfield:
                if field.tag == "Zusammenfassung":
                    metadata[field.tag] = cleaner.removePunctuation(field.text)
                elif field.tag == "Loesung":
                    metadata[field.tag] = cleaner.removeWhitespace(field.text)
                else:
                    metadata[field.tag] = field.text

        yield metadata


"""
def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None):

    if custom_symbols is not None:
        custom_symbols = custom_symbols
    else:
        custom_symbols = []

    if keep is not None:
        keep = keep
    else:
        keep = []

    # List of symbols we don't care about
    symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols

    # parse with spaCy
    spacy_doc = parser(string)
    tokens = []

    pos = ["NUM", "SPACE", "PUNCT"]
    for p in keep:
        pos.remove(p)

    # append Tokens to a list
    for tok in spacy_doc:
        if tok.pos_ not in pos and tok.text not in symbols:
            tokens.append(tok.text)

    return " ".join(tokens)


def cleanText_words(string,parser=PARSER, custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False):

    # use preprocessing
    if customPreprocessing is not None:
        string = customPreprocessing(string)

    if custom_stopwords is not None:
        custom_stopwords = custom_stopwords
    else:
        custom_stopwords = []

    if custom_words is not None:
        custom_words = custom_words
    else:
        custom_words = []

    # custom stoplist
    # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
    stop_words = __import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS

    stoplist = list(stop_words) + custom_stopwords

    # replace twitter
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    string = mentionFinder.sub("MENTION", string)

    # replace emails
    emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
    string = emailFinder.sub("EMAIL", string)

    # replace urls
    urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
    string = urlFinder.sub("URL", string)

    # replace HTML symbols
    string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")

    # parse with spaCy
    spacy_doc = parser(string)
    tokens = []

    added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"]
    added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"]  # for topic modeling, nouns only http://aclweb.org/anthology/U15-1013

    # append Tokens to a list
    for tok in spacy_doc:
        if tok.pos_ in added_POS:
            if lemmatize:
                tokens.append(tok.lemma_.lower().strip())
            else:
                tokens.append(tok.text.lower().strip())

        # add entities
        if tok.ent_type_ in added_entities:
            tokens.append(tok.text.lower())

    # remove stopwords
    tokens = [tok for tok in tokens if tok not in stoplist]

    # remove custom_words
    tokens = [tok for tok in tokens if tok not in custom_words]

    # remove single characters
    tokens = [tok for tok in tokens if len(tok)>1]

    # remove large strings of whitespace
    #remove_whitespace(" ".join(tokens))

    # idea: resolve abbreviations (above all TU -> Technische Universität): keep an abbreviation glossary

    if normalize_synonyms:
        tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens]

    return " ".join(set(tokens))


def cleanText_removeWhitespace(sentence):
    whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
    sentence = whitespaceFinder.sub(" ", sentence)
    return sentence

# todo: preprocess pipe: removewhitespace, removePUNCT, resolveAbk, keepPOS, keepEnt, removeWords, normalizeSynonyms


def getFirstSynonym(word, thesaurus_gen):

    word = word.lower()

    # iterate over the thesaurus
    for syn_block in thesaurus_gen:  # syn_block is a list of synonyms

        for syn in syn_block:
            syn = syn.lower()
            if re.match(r'\A[\w-]+\Z', syn):  # if syn is a single word
                if word == syn:
                    return getHauptform(syn_block, word)
            else:  # if it is a phrase
                if word in syn:
                    return getHauptform(syn_block, word)
    return word  # as a last resort, return the original word


def getHauptform(syn_block, word, default_return_first_Syn=False):

    for syn in syn_block:
        syn = syn.lower()

        if "hauptform" in syn and len(syn.split(" ")) <= 2:
            # do not return it if it is in parentheses
            for w in syn.split(" "):
                if not re.match(r'\([^)]+\)', w):
                    return w

    if default_return_first_Syn:
        # if no Hauptform is present, return the first synonym that is not a sentence and not in parentheses
        for w in syn_block:
            if not re.match(r'\([^)]+\)', w):
                return w
    return word  # as a last resort, return the original word
"""

def printRandomDoc(textacyCorpus):
    print()

    print("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))

    print()

####################'####################'####################'####################'####################'##############
# todo config-file

DATAPATH = "ticketSamples.xml"
DATAPATH_thesaurus = "openthesaurus.csv"

normalize_Synonyms = True
clean = True
lemmatize = True

custom_words = ["grüßen", "fragen"]

####################'####################'####################'####################'####################'##############

## files to textacy-corpi
textacyCorpus = textacy.Corpus(PARSER)

print("add texts to textacy-corpi...")
textacyCorpus.add_texts(texts=generateTextfromTicketXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromTicketXML(DATAPATH))

#for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize):
#    textacyCorpus.add_text(txt,dic)

for doc in textacyCorpus:
    print(doc.metadata)
    print(doc.text)

#print(textacyCorpus[2].text)
#printRandomDoc(textacyCorpus)
#print(textacyCorpus[len(textacyCorpus)-1].text)

print()
print()

old/test.py · 213 changed lines (file deleted)
@@ -1,213 +0,0 @@
# -*- coding: utf-8 -*-
import spacy
import textacy
from spacy.tokens import Doc

# -*- coding: utf-8 -*-
import re
import spacy
import functools

import textacy


class TextCleaner:

    def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4All=None):
        """
        :param parser: spacy-parser
        :param thesaurus: [[syn1, syn2, ...],[syn1, syn2, ...], ...]
        :param customClass_symbols:[str]
        :param customClass_words:[str]
        :param customClassPOS:[str]
        :param keep4All: [str]
        """
        if thesaurus is None:
            DATAPATH_thesaurus = "openthesaurus.csv"

            ## !!!!!! list() is important here, otherwise the same synonyms are not returned, because a generator picks during runtime
            self.thesaurus = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))
        else:
            self.thesaurus = thesaurus

        self.parser = parser

        #self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
        self.mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
        self.emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
        self.urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)

        # to keep
        self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]  # ,"PERSON"]
        self.pos2keep = ["NOUN"]  # , "NUM" ]#,"VERB","ADJ"]  # for topic modeling, nouns only http://aclweb.org/anthology/U15-1013

        """

        # to remove
        self.symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|", "||",
                        ";", ":",
                        "…", "’s", "'s", ".", "(", ")", "[", "]", "#"] + (customClass_symbols if customClass_symbols is not None else [])
        self.stop_words