Runnable version

jannis.grundmann 2017-11-06 12:54:59 +01:00
parent ecc8c0c54a
commit 0a6a68b8aa
45 changed files with 826 additions and 1175985 deletions

383
backup.py

@ -1,383 +0,0 @@
# -*- coding: utf-8 -*-

# imports needed by the functions below; note: DE_PARSER (a loaded German spaCy
# model) and LEMMAS (path to the lemmatization list) are used as defaults further
# down and are presumably defined in lines omitted from this diff.
import functools
import logging
import re
import sys
import warnings
import xml.etree.ElementTree as ET

import spacy
import textacy

############# misc
def printlog(string, level="INFO"):
    """log and prints"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)

printlog("Load functions")

def compose(*functions):
    def compose2(f, g):
        return lambda x: f(g(x))
    return functools.reduce(compose2, functions, lambda x: x)
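
# compose() chains its arguments right to left; a tiny sketch:
def _example_compose():
    double_after_inc = compose(lambda x: 2 * x, lambda x: x + 1)
    return double_after_inc(3)  # (3 + 1) * 2 == 8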
def get_calling_function():
    """finds the calling function in many decent cases.
    https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name
    """
    fr = sys._getframe(1)   # inspect.stack()[1][0]
    co = fr.f_code
    for get in (
            lambda: fr.f_globals[co.co_name],
            lambda: getattr(fr.f_locals['self'], co.co_name),
            lambda: getattr(fr.f_locals['cls'], co.co_name),
            lambda: fr.f_back.f_locals[co.co_name],  # nested
            lambda: fr.f_back.f_locals['func'],      # decorators
            lambda: fr.f_back.f_locals['meth'],
            lambda: fr.f_back.f_locals['f'],
    ):
        try:
            func = get()
        except (KeyError, AttributeError):
            pass
        else:
            if func.__code__ == co:
                return func
    raise AttributeError("func not found")
def printRandomDoc(textacyCorpus):
    import random
    print()
    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    printlog("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
    print()
############# load xml
def generateMainTextfromTicketXML(path2xml, main_textfield='Description'):
    """
    generates strings from XML
    :param path2xml:
    :param main_textfield:
    :yields strings
    """
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root:
        for field in ticket:
            if field.tag == main_textfield:
                yield field.text

def generateMetadatafromTicketXML(path2xml, leave_out=['Description']):
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root:
        metadata = {}
        for field in ticket:
            if field.tag not in leave_out:
                metadata[field.tag] = field.text
        yield metadata
############# load csv
def csv_to_contentStream(path2csv: str, content_collumn_name: str):
    """
    :param path2csv: string
    :param content_collumn_name: string
    :return: string-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')
    content_collumn = 0  # default value

    for i, lst in enumerate(stream):
        if i == 0:
            # look for the desired column
            for j, col in enumerate(lst):
                if col == content_collumn_name:
                    content_collumn = j
        else:
            yield lst[content_collumn]

def csv_to_metaStream(path2csv: str, metalist: [str]):
    """
    :param path2csv: string
    :param metalist: list of strings
    :return: dict-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')
    content_collumn = 0  # default value
    metaindices = []
    metadata_temp = {}

    for i, lst in enumerate(stream):
        if i == 0:
            for j, col in enumerate(lst):  # could surely be done more efficiently, but this runs only once
                for key in metalist:
                    if key == col:
                        metaindices.append(j)
            metadata_temp = dict(zip(metalist, metaindices))  # e.g. {'Subject': 1, 'categoryName': 3, 'Solution': 10}
        else:
            metadata = metadata_temp.copy()
            for key, value in metadata.items():
                metadata[key] = lst[value]
            yield metadata
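
# A minimal usage sketch for the two CSV generators above (the file path is the
# ticket export referenced in config.ini; the metadata keys are illustrative):
def _example_csv_streams():
    texts = csv_to_contentStream("M42-Export/Tickets_small.csv", "Description")
    metas = csv_to_metaStream("M42-Export/Tickets_small.csv", ["TicketNumber", "Subject", "categoryName"])
    # both generators read the file independently, so they can be consumed in parallel
    for text, meta in zip(texts, metas):
        print(meta["TicketNumber"], text[:40])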
############################################ Preprocessing ##############################################

############# on str-gen
def processTokens(tokens, funclist, parser):
    # in: token list, funclist
    # out: token list
    for f in funclist:
        # idea: sort funclist so that all string methods run first, then parse, then operate on tokens, then possibly on the whole doc
        if 'bool' in str(f.__annotations__):
            tokens = list(filter(f, tokens))

        elif 'str' in str(f.__annotations__):
            tokens = list(map(f, tokens))   # plain text
            doc = parser(" ".join(tokens))  # re-parse
            tokens = [tok for tok in doc]   # tokens only

        elif 'spacy.tokens.doc.Doc' in str(f.__annotations__):
            # todo feels hacky
            doc = parser(" ".join(tok.lower_ for tok in tokens))  # parsed
            tokens = f(doc)
            doc = parser(" ".join(tokens))  # parsed
            tokens = [tok for tok in doc]   # tokens only

        else:
            warnings.warn("Unknown Annotation while preprocessing. Function: {0}".format(str(f)))

    return tokens

def processTextstream(textstream, funclist, parser=DE_PARSER):
    """
    :param textstream: string-gen
    :param funclist: [func]
    :param parser: spacy-parser
    :return: string-gen
    """
    # input: str-stream   output: str-stream
    pipe = parser.pipe(textstream)

    for doc in pipe:
        tokens = []
        for tok in doc:
            tokens.append(tok)

        tokens = processTokens(tokens, funclist, parser)
        yield " ".join([tok.lower_ for tok in tokens])
def processDictstream(dictstream, funcdict, parser=DE_PARSER):
    """
    :param dictstream: dict-gen
    :param funcdict:
            clean_in_meta = {
                "Solution": funclist,
                ...
            }
    :param parser: spacy-parser
    :return: dict-gen
    """
    for dic in dictstream:
        result = {}
        for key, value in dic.items():
            if key in funcdict:
                doc = parser(value)
                tokens = [tok for tok in doc]
                funclist = funcdict[key]

                tokens = processTokens(tokens, funclist, parser)
                result[key] = " ".join([tok.lower_ for tok in tokens])
            else:
                result[key] = value
        yield result
############# return bool
def keepPOS(pos_list) -> bool:
    ret = lambda tok: tok.pos_ in pos_list
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def removePOS(pos_list) -> bool:
    ret = lambda tok: tok.pos_ not in pos_list
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def removeWords(words, keep=None) -> bool:
    if hasattr(keep, '__iter__'):
        for k in keep:
            try:
                words.remove(k)
            except ValueError:
                pass

    ret = lambda tok: tok.lower_ not in words
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def keepENT(ent_list) -> bool:
    ret = lambda tok: tok.ent_type_ in ent_list
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def removeENT(ent_list) -> bool:
    ret = lambda tok: tok.ent_type_ not in ent_list
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def remove_words_containing_Numbers() -> bool:
    ret = lambda tok: not bool(re.search(r'\d', tok.lower_))
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def remove_words_containing_specialCharacters() -> bool:
    ret = lambda tok: not bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', tok.lower_))
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def remove_words_containing_topLVL() -> bool:
    ret = lambda tok: not bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', tok.lower_))
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def lemmatizeWord(word, filepath=LEMMAS):
    """http://www.lexiconista.com/datasets/lemmatization/"""
    for line in list(textacy.fileio.read_file_lines(filepath=filepath)):
        if word.lower() == line.split()[1].strip().lower():
            return line.split()[0].strip().lower()
    return word.lower()  # fall back to the input if nothing was found

def lemmatize() -> str:
    ret = lambda tok: lemmatizeWord(tok.lower_)
    ret.__annotations__ = get_calling_function().__annotations__
    return ret
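
# The factories above reuse the return annotation as a dispatch tag: each one
# copies its own __annotations__ onto the returned lambda, and processTokens()
# inspects that dict to decide whether it got a token filter ('bool'), a
# per-token string replacer ('str') or a doc-level function. A small sketch:
def _example_annotation_dispatch():
    f = removePOS(["PUNCT", "SPACE"])
    print(f.__annotations__)  # {'return': <class 'bool'>} -> treated as a filter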
############# return strings
mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
hardSFinder = re.compile(r'[ß]', re.IGNORECASE)

def replaceEmails(replace_with="EMAIL") -> str:
    ret = lambda tok: emailFinder.sub(replace_with, tok.lower_)
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def replaceURLs(replace_with="URL") -> str:
    ret = lambda tok: textacy.preprocess.replace_urls(tok.lower_, replace_with=replace_with)
    # ret = lambda tok: urlFinder.sub(replace_with, tok.lower_)
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def replaceSpecialChars(replace_with=" ") -> str:
    ret = lambda tok: specialFinder.sub(replace_with, tok.lower_)
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def replaceTwitterMentions(replace_with="TWITTER_MENTION") -> str:
    ret = lambda tok: mentionFinder.sub(replace_with, tok.lower_)
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def replaceNumbers(replace_with="NUMBER") -> str:
    ret = lambda tok: textacy.preprocess.replace_numbers(tok.lower_, replace_with=replace_with)
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def replacePhonenumbers(replace_with="PHONENUMBER") -> str:
    ret = lambda tok: textacy.preprocess.replace_phone_numbers(tok.lower_, replace_with=replace_with)
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def replaceHardS(replace_with="ss") -> str:
    ret = lambda tok: hardSFinder.sub(replace_with, tok.lower_)
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def fixUnicode() -> str:
    ret = lambda tok: textacy.preprocess.fix_bad_unicode(tok.lower_, normalization=u'NFC')
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def resolveAbbreviations():
    pass  # todo
    # todo: remove words with len < 2 (after resolving abbreviations, esp. "tu" and "fh") and words with len > 35 or 50 ("Reiserücktrittskostenversicherung")
############# return docs
def keepUniqeTokens() -> spacy.tokens.Doc:
    ret = lambda doc: (set([tok.lower_ for tok in doc]))
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def lower() -> spacy.tokens.Doc:
    ret = lambda doc: ([tok.lower_ for tok in doc])
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

################################################################################################################
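
# A minimal end-to-end sketch of how the pieces above combine (assumptions:
# DE_PARSER is the loaded German spaCy model used as a default elsewhere in
# this file, and the sample sentence is purely illustrative):
def _example_pipeline():
    funclist = [
        removePOS(["PUNCT", "SPACE", "NUM"]),   # token filter        -> 'bool' branch
        replaceEmails(),                        # per-token replacement -> 'str' branch
        remove_words_containing_Numbers(),      # token filter        -> 'bool' branch
    ]
    texts = ["Lieber Support, bitte an max.mustermann@tu-dortmund.de antworten."]
    for cleaned in processTextstream(texts, funclist, parser=DE_PARSER):
        print(cleaned)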


@ -11,6 +11,9 @@ from scipy import *
import os
from preprocessing import removePOS
from preprocessing import filterTokens
csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
@ -24,11 +27,6 @@ with open(config_ini) as f:
config.read_file(f)
global REGEX_SPECIALCHAR
global WORDS
REGEX_SPECIALCHAR = r'[`\=~%^&*()_+\[\]{};\'"|</>]' #+r',.-\\:' #+r',.?!'
WORDS= {}
@ -113,15 +111,12 @@ def clean(stringstream,autocorrect=False):
string = re.sub(r'[ü]', "ue", string)
string = re.sub(r'[ä]', "ae", string)
# question: autocorrect?
# idea: http://lexitron.nectec.or.th/public/COLING-2010_Beijing_China/POSTERS/pdf/POSTERS022.pdf
# question: autocorrect? idea: http://lexitron.nectec.or.th/public/COLING-2010_Beijing_China/POSTERS/pdf/POSTERS022.pdf
if autocorrect:
string = " ".join([autocorrectWord(word) for word in string.split()])
yield string
def processDictstream(dictstream, funcdict, parser):
"""
@ -154,30 +149,21 @@ def processDictstream(dictstream, funcdict, parser):
result[key] = value
yield result
def filterTokens(tokens, funclist):
# in:tokenlist, funclist
# out: tokenlist
for f in funclist:
tokens = list(filter(f, tokens))
return tokens
def removePOS(pos_list):
return lambda tok: tok.pos_ not in pos_list
##################################################################################################
ressources_path = FILEPATH + "ressources/"
path2wordsdict = FILEPATH + config.get("spellchecking", "pickle_file")
path2wordsdict = ressources_path + config.get("spellchecking", "pickle_file")
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
autocorrect = config.getboolean("preprocessing", "autocorrect")
def cleanCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10):
def cleanCorpus(corpus_path, clean_in_meta, lang="de", printrandom=10,autocorrect=False):
logprint("Clean {0}_corpus at {1}".format(lang, datetime.now()))
@ -192,7 +178,7 @@ def cleanCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrando
## process and add files to textacy-corpi,
clean_corpus.add_texts(
clean(corpus2Text(raw_corpus)),
clean(corpus2Text(raw_corpus),autocorrect=autocorrect),
processDictstream(corpus2Meta(raw_corpus), clean_in_meta,parser=parser)
)
@ -220,8 +206,6 @@ def main():
WORDS = load_obj(path2wordsdict)
clean_in_content = [] # question: necessary?
clean_in_meta = {
"Solution": [removePOS(["SPACE"])],
@ -229,7 +213,7 @@ def main():
"categoryName": [removePOS(["SPACE", "PUNCT"])]
}
corpus = cleanCorpus(corpus_de_path, clean_in_content, clean_in_meta, "de",printrandom=5 )
corpus = cleanCorpus(corpus_de_path, clean_in_meta, "de",printrandom=5, autocorrect=autocorrect )
end = time.time()
logprint("Time Elapsed Cleaning:{0} min".format((end - start) / 60))


@ -1,24 +0,0 @@
Index: 0
Text: lieber support, ich habe gerade versucht mich mit meiner unicard im firefox browser fuer das service portal zu authentifizieren. das hat vor einigen wochen noch tadelos geklappt und mittlerweile bekomme ich folgende fehlermeldung ich hoffe sie koennen mir weiterhelfen. vielen dank und viele gruesse sascha feldhorst dipl. inform. sascha feldhorst wiss. ang. technische universitaet dortmund maschinenbau lehrstuhl fuer foerder und lagerwesen logistikcampus joseph von fraunhofer str. 2 4 d 44227 dortmund tel. 49 231 755 40 73 fax 49 231 755 47 68 mailto sascha.feldhorst@tu dortmund.de sascha.feldhorst@tu dortmund.de http www.flw.mb.tu dortmund.de www.flw.mb.tu dortmund.de wichtiger hinweis die information in dieser e mail ist vertraulich. sie ist ausschliesslich fuer den adressaten bestimmt. sollten sie nicht der fuer diese e mail bestimmte adressat sein, unterrichten sie bitte den absender und vernichten sie diese mail. vielen dank. unbeschadet der korrespondenz per e mail, sind unsere erklaerungen ausschliesslich final rechtsverbindlich, wenn sie in herkoemmlicher schriftform mit eigenhaendiger unterschrift oder durch uebermittlung eines solchen schriftstuecks per telefax erfolgen. important note the information included in this e mail is confidential. it is solely intended for the recipient. if you are not the intended recipient of this e mail please contact the sender and delete this message. thank you. without prejudice of e mail correspondence, our statements are only legally binding when they are made in the conventional written form with personal signature or when such documents are sent by fax.
categoryName: betrieb
Index: 0
Text: support browser service portal mittlerweile
categoryName: betrieb
Index: 1
Text: telefon umzug antragsteller astrid gramm astrid.gramm@tu dortmund.de terminvorschlag 14.08.2015 einrichtung dezernat 2 abteilung 2.5 psp element uniaccount mnichofm hofmann, nicole gebaeude dezernat 5 raum id 201 651430 telefondose neztwerkdose dt04.5 04.6 telefonnr. 4821 eintrag telefonbuch e mail astrid.gramm@tu dortmund.de voicemail ansprechpartner astrid gramm tel. ansprechpartner 5444 verantwortlicher nutzer type bemerkung frau hofmann wird am 14.08.2015 in die wd 2 umziehen. es ist der raum 201a im og nicht 201 eine bezeichnung der telefondose ist nicht vorhanden.
categoryName: elektronisches telefonbuch
Index: 1
Text: telefon umzug antragsteller gramm einrichtung dezernat abteilung element gebaeude dezernat raum id eintrag telefonbuch mail ansprechpartner gramm ansprechpartner verantwortlicher nutzer type bemerkung raum bezeichnung
categoryName: elektronisches telefonbuch


@ -37,12 +37,12 @@ pickle_file=en_stopwords_list.pkl
[logging]
level=INFO
filename=topicModelTickets.log
filename=log/topicModelTickets.log
[de_corpus]
input=M42-Export/Tickets_small.csv
#input=M42-Export/de_tickets.csv
#input=M42-Export/Tickets_small.csv
input=M42-Export/de_tickets.csv
path=corpi/
@ -64,7 +64,10 @@ metaliste=TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerI
#ents2keep=WORK_OF_ART,ORG,PRODUCT,LOC
custom_words=eintrag,element,nutzer,einrichtung,abteilung,gebaeude,raum,ansprechpartner,geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok
autocorrect = false
#true
custom_words=aenderung,hahn,verantwortlicher,rolle,status,fehlgeschlagen,aenderung,test,erwuenscht,antragsteller,bemerkung,tu,uni,prof,bezeichnung,gramm,type,eintrag,element,nutzer,einrichtung,abteilung,gebaeude,raum,ansprechpartner,geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok,januar,februar,maerz,april,mai,juni,juli,august,september,oktober,november,dezember


@ -23,8 +23,6 @@ with open(config_ini) as f:
def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):
"""
:param path2csv: string
@ -75,27 +73,9 @@ def ticket_csv_to_DictStream(path2csv: str, metalist: [str]):
##################################################################################################
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corporization.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_corporization.log &"
"""
content_collumn_name = "Description"
metaliste = [
"TicketNumber",
"Subject",
"CreatedDate",
"categoryName",
"Impact",
"Urgency",
"BenutzerID",
"VerantwortlicherID",
"EigentuemerID",
"Solution"
]
"""
content_collumn_name = config.get("tickets","content_collumn_name")
metaliste = list(map(normalize_whitespace,config.get("tickets","metaliste").split(",")))
metaliste = get_list_from_config("tickets","metaliste")
path2de_csv = FILEPATH + config.get("de_corpus","input")
@ -110,7 +90,6 @@ corpus_en_path = FILEPATH + config.get("en_corpus", "path")
def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, lang, printrandom=0):
# print paths
path_csv_split = path2_csv.split("/")
filename = path_csv_split[len(path_csv_split) - 1]
@ -121,8 +100,6 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la
raw_corpus = textacy.Corpus(lang)
## add files to textacy-corpi,
#printlog("Add texts to {0}_textacy-corpi".format(lang))
raw_corpus.add_texts(
ticketcsv_to_textStream(path2_csv, content_collumn_name),
ticket_csv_to_DictStream(path2_csv, metaliste)
@ -132,6 +109,7 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la
# remove empty docs from the corpus
raw_corpus.remove(lambda doc: len(doc) == 0)
logprint("corpus-length: {}".format(len(raw_corpus)))
# print a few random docs
for i in range(printrandom):
printRandomDoc(raw_corpus)

File diff suppressed because it is too large.

File diff suppressed because it is too large.

33
init.py

@ -237,36 +237,37 @@ def build_words_for_spellchecking(path2words):
##################################################################################################
# THESAURUS
path2wordnet = FILEPATH + config.get("thesaurus","input")
path2thesaurus_dict = FILEPATH + config.get("thesaurus","pickle_file")
ressources_path = FILEPATH + "ressources/"
path2wordnet = ressources_path + config.get("thesaurus","input")
path2thesaurus_dict = ressources_path + config.get("thesaurus","pickle_file")
# SPELLCHECKING
path2words_file = FILEPATH + config.get("spellchecking","input")
path2wordlist = FILEPATH + config.get("spellchecking","pickle_file")
path2words_file = ressources_path + config.get("spellchecking","input")
path2wordlist = ressources_path + config.get("spellchecking","pickle_file")
# LEMMA
path2lemma_file = FILEPATH + config.get("lemmatization","input")
path2lemmadict = FILEPATH + config.get("lemmatization","pickle_file")
path2lemma_file = ressources_path + config.get("lemmatization","input")
path2lemmadict = ressources_path + config.get("lemmatization","pickle_file")
# NOMEN
nouns1 = FILEPATH + config.get("nouns","input1")
nouns2 = FILEPATH + config.get("nouns","input2")
path2nouns_list = FILEPATH + config.get("nouns","pickle_file")
nouns1 = ressources_path + config.get("nouns","input1")
nouns2 = ressources_path + config.get("nouns","input2")
path2nouns_list = ressources_path + config.get("nouns","pickle_file")
# VORNAMEN
firstnames_txt = FILEPATH + config.get("firstnames","input")
path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file")
firstnames_txt = ressources_path + config.get("firstnames","input")
path2firstnameslist = ressources_path + config.get("firstnames","pickle_file")
# STOPWORDS
stop1 = FILEPATH + config.get("de_stopwords","input1")
stop2 = FILEPATH + config.get("de_stopwords","input2")
stop3 = FILEPATH + config.get("de_stopwords","input3")
path2stopwordlist_de = FILEPATH + config.get("de_stopwords","pickle_file")
stop1 = ressources_path + config.get("de_stopwords","input1")
stop2 = ressources_path + config.get("de_stopwords","input2")
stop3 = ressources_path + config.get("de_stopwords","input3")
path2stopwordlist_de = ressources_path + config.get("de_stopwords","pickle_file")
path2stopwordlist_en = FILEPATH + config.get("en_stopwords","pickle_file")
path2stopwordlist_en = ressources_path + config.get("en_stopwords","pickle_file")

File diff suppressed because it is too large.

File diff suppressed because it is too large.

21
main.py

@ -11,12 +11,12 @@ import cleaning
from miscellaneous import *
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &"
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/log/printout_main.log &"
start = time.time()
#init.main()
init.main()
logprint("")
corporization.main()
@ -30,32 +30,23 @@ logprint("")
"""
topicModeling.main(use_raw=False,algorithm="lsa")
#topicModeling.main(use_cleaned=False,algorithm="lsa")
logprint("")
topicModeling.main(use_raw=False,algorithm="lda")
#topicModeling.main(use_cleaned=False,algorithm="nmf")
logprint("")
topicModeling.main(use_raw=False,algorithm="nmf")
#topicModeling.main(use_cleaned=False,algorithm="lda")
logprint("")
topicModeling.main(use_raw=False,algorithm="llda")
topicModeling.main(use_cleaned=False,algorithm="llda")
logprint("")
"""
logprint("")
end = time.time()
logprint("Total Time Elapsed: {0} min".format((end - start) / 60))


@ -153,6 +153,25 @@ def printRandomDoc(textacyCorpus):
print()
def get_list_from_config(section, option):
    return list(map(textacy.preprocess.normalize_whitespace, config.get(section, option).split(",")))

def corpus2Text(corpus):
    for doc in corpus:
        yield doc.text

def corpus2Meta(corpus):
    for doc in corpus:
        yield doc.metadata

def savelabledCorpiLines(corpus, filepath):
    textacy.fileio.write_file_lines(gen_labledLines(corpus), filepath=filepath)

def gen_labledLines(corpus):
    for doc in corpus:
        # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi
        yield "[" + doc.metadata["categoryName"] + "] " + doc.text
def save_corpus(corpus, corpus_path, corpus_name):
@ -219,95 +238,6 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
for key,value in plain.items():
if key != "content" and key != "index":
meta[key] = value
corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=meta))
return corpus, corpus.spacy_lang
"""
def corpus2Text(corpus):
for doc in corpus:
yield doc.text
def corpus2Meta(corpus):
for doc in corpus:
yield doc.metadata
def saveplaincorpustext(corpus,path):
textacy.fileio.write_file_lines(corpus2Text(corpus),filepath=path )
def save_corpusV2(corpus, corpus_path, corpus_name):
# save parser
parser = corpus.spacy_lang
parserpath = corpus_path + str(parser.lang) + '_parser'
parser.save_to_directory(parserpath)
contentpath = corpus_path +corpus_name + "_docs/"
if not os.path.exists(contentpath):
os.makedirs(contentpath)
for doc in corpus:
with open(contentpath + str(doc.corpus_index) + "_doc.bin", 'w') as f:
f.write(doc.spacy_doc.to_bytes())
with open(contentpath + str(doc.corpus_index) + "_meta.json", 'w') as file:
file.write(json.dumps(doc.metadata))
def load_corpusV2(corpus_path, corpus_name, lang="de"):
# ckeck for language
if "de_" in corpus_name:
lang = "de"
elif "en_" in corpus_name:
lang = "en"
# load parser
parser = spacy.load(lang)
stringstorepath = corpus_path + str(lang) + '_parser' + '/vocab/strings.json'
with open(stringstorepath) as file:
parser.vocab.strings.load(file)
vocabpath = Path(corpus_path + str(lang) + '_parser' + '/vocab/lexemes.bin')
parser.vocab.load_lexemes(vocabpath)
# load corpus
corpus = textacy.Corpus(parser)
contentpath = corpus_path + corpus_name + "_docs/"
docs = yield_fromdir(contentpath,spacy_vocab=corpus.spacy_vocab,type="doc")
metas = yield_fromdir(contentpath,type="meta")
for doc,meta in zip(docs,metas):
corpus.add_doc(
textacy.Doc(doc, lang=corpus.spacy_lang, metadata=meta))
return corpus, corpus.spacy_lang
def yield_fromdir(path,spacy_vocab=None,type=".pkl"):
os.chdir(path)
filelist = [name for name in os.listdir('.') if os.path.isfile(name)]
filelist = [filename for filename in filelist if type in filename]
filelist.sort(key = lambda elem : elem.split("_")[0])
if type =='doc':
for filename in filelist:
with open(path+filename,'r') as f:
for bytes_string in SpacyDoc.read_bytes(f):
yield SpacyDoc(spacy_vocab).from_bytes(bytes_string)
elif type == 'meta':
for filename in filelist:
with open(path+filename,'r') as f:
yield json.load(f)
else:
for filename in filelist:
yield load_obj(path+filename)
"""
corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=meta))
return corpus, corpus.spacy_lang


@ -1,466 +0,0 @@
# -*- coding: utf-8 -*-
import csv
import random
import sys
import spacy
import textacy
"""
import keras
import numpy as np
from keras.layers import Dense, SimpleRNN, LSTM, TimeDistributed, Dropout
from keras.models import Sequential
import keras.backend as K
"""
csv.field_size_limit(sys.maxsize)
"""
def getFirstSynonym(word, thesaurus_gen):
word = word.lower()
# TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python
# durch den thesaurrus iterieren
for syn_block in thesaurus_gen: # syn_block ist eine liste mit Synonymen
# durch den synonymblock iterieren
for syn in syn_block:
syn = syn.lower().split(" ") if not re.match(r'\A[\w-]+\Z', syn) else syn # aus synonym mach liste (um evtl. sätze zu identifieziren)
# falls das wort in dem synonym enthalten ist (also == einem Wort in der liste ist)
if word in syn:
# Hauptform suchen
if "auptform" in syn:
# nicht ausgeben, falls es in Klammern steht
for w in syn:
if not re.match(r'\([^)]+\)', w) and w is not None:
return w
# falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht
if len(syn) == 1:
w = syn[0]
if not re.match(r'\([^)]+\)', w) and w is not None:
return w
return word # zur Not die eingabe ausgeben
"""
"""
def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False, normalize_synonyms=False):
# use preprocessing
if customPreprocessing is not None:
string = customPreprocessing(string)
if custom_stopwords is not None:
custom_stopwords = custom_stopwords
else:
custom_stopwords = []
if custom_words is not None:
custom_words = custom_words
else:
custom_words = []
if custom_symbols is not None:
custom_symbols = custom_symbols
else:
custom_symbols = []
# custom stoplist
# https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
stop_words = __import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS
stoplist =list(stop_words) + custom_stopwords
# List of symbols we don't care about either
symbols = ["-----","---","...","","",".","-","<",">",",","?","!","..","nt","n't","|","||",";",":","","s","'s",".","(",")","[","]","#"] + custom_symbols
# get rid of newlines
string = string.strip().replace("\n", " ").replace("\r", " ")
# replace twitter
mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
string = mentionFinder.sub("MENTION", string)
# replace emails
emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
string = emailFinder.sub("EMAIL", string)
# replace urls
urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
string = urlFinder.sub("URL", string)
# replace HTML symbols
string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
# parse with spaCy
spacy_doc = PARSER(string)
tokens = []
added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"]
added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #IDEE NUM mit in den Corpus aufnehmen, aber fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013
# append Tokens to a list
for tok in spacy_doc:
if tok.pos_ in added_POS:
if lemmatize:
tokens.append(tok.lemma_.lower().strip())
else:
tokens.append(tok.text.lower().strip())
# add entities
if tok.ent_type_ in added_entities:
tokens.append(tok.text.lower())
# remove stopwords
tokens = [tok for tok in tokens if tok not in stoplist]
# remove symbols
tokens = [tok for tok in tokens if tok not in symbols]
# remove custom_words
tokens = [tok for tok in tokens if tok not in custom_words]
# remove single characters
tokens = [tok for tok in tokens if len(tok)>1]
# remove large strings of whitespace
remove_large_strings_of_whitespace(" ".join(tokens))
#idee abkürzungen auflösen (v.a. TU -> Technische Universität)
if normalize_synonyms:
tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens]
return " ".join(tokens)
def remove_large_strings_of_whitespace(sentence):
whitespaceFinder = re.compile(r'(\r\n|\r|\n)', re.IGNORECASE)
sentence = whitespaceFinder.sub(" ", sentence)
tokenlist = sentence.split(" ")
while "" in tokenlist:
tokenlist.remove("")
while " " in tokenlist:
tokenlist.remove(" ")
return " ".join(tokenlist)
"""
"""
def generateFromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False,lemmatize=False):
import xml.etree.ElementTree as ET
tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()
for ticket in root:
metadata = {}
text = "ERROR"
for field in ticket:
if field.tag == textfield:
if clean:
text = cleanText_words(field.text,PARSER,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize)
else:
text = field.text
else:
#idee hier auch cleanen?
metadata[field.tag] = field.text
yield text, metadata
"""
LANGUAGE = 'de'
#PARSER = de_core_news_md.load()
PARSER = spacy.load(LANGUAGE)
from old.textCleaning import TextCleaner
cleaner = TextCleaner(parser=PARSER)
def generateTextfromTicketXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False, lemmatize=False):
    import xml.etree.ElementTree as ET

    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root:
        text = "ERROR"
        for field in ticket:
            if field.tag == textfield:
                if clean:
                    text = cleaner.normalizeSynonyms(cleaner.removeWords(cleaner.keepPOSandENT(field.text)))  # ,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize)
                else:
                    text = field.text
        yield text

def generateMetadatafromTicketXML(path2xml, textfield='Beschreibung'):  # ,keys_to_clean=["Loesung","Zusammenfassung"]):
    import xml.etree.ElementTree as ET

    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root:
        metadata = {}
        for field in ticket:
            if field.tag != textfield:
                if field.tag == "Zusammenfassung":
                    metadata[field.tag] = cleaner.removePunctuation(field.text)
                elif field.tag == "Loesung":
                    metadata[field.tag] = cleaner.removeWhitespace(field.text)
                else:
                    metadata[field.tag] = field.text
        yield metadata
"""
def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None):
if custom_symbols is not None:
custom_symbols = custom_symbols
else:
custom_symbols = []
if keep is not None:
keep = keep
else:
keep = []
# List of symbols we don't care about
symbols = ["-----","---","...","","",".","-","<",">",",","?","!","..","nt","n't","|","||",";",":","","s","'s",".","(",")","[","]","#"] + custom_symbols
# parse with spaCy
spacy_doc = parser(string)
tokens = []
pos = ["NUM", "SPACE", "PUNCT"]
for p in keep:
pos.remove(p)
# append Tokens to a list
for tok in spacy_doc:
if tok.pos_ not in pos and tok.text not in symbols:
tokens.append(tok.text)
return " ".join(tokens)
def cleanText_words(string,parser=PARSER, custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False):
# use preprocessing
if customPreprocessing is not None:
string = customPreprocessing(string)
if custom_stopwords is not None:
custom_stopwords = custom_stopwords
else:
custom_stopwords = []
if custom_words is not None:
custom_words = custom_words
else:
custom_words = []
# custom stoplist
# https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
stop_words = __import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS
stoplist =list(stop_words) + custom_stopwords
# replace twitter
mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
string = mentionFinder.sub("MENTION", string)
# replace emails
emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
string = emailFinder.sub("EMAIL", string)
# replace urls
urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
string = urlFinder.sub("URL", string)
# replace HTML symbols
string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
# parse with spaCy
spacy_doc = parser(string)
tokens = []
added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"]
added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013
# append Tokens to a list
for tok in spacy_doc:
if tok.pos_ in added_POS:
if lemmatize:
tokens.append(tok.lemma_.lower().strip())
else:
tokens.append(tok.text.lower().strip())
# add entities
if tok.ent_type_ in added_entities:
tokens.append(tok.text.lower())
# remove stopwords
tokens = [tok for tok in tokens if tok not in stoplist]
# remove custom_words
tokens = [tok for tok in tokens if tok not in custom_words]
# remove single characters
tokens = [tok for tok in tokens if len(tok)>1]
# remove large strings of whitespace
#remove_whitespace(" ".join(tokens))
#idee abkürzungen auflösen (v.a. TU -> Technische Universität): abkürzungsverezeichnis
if normalize_synonyms:
tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens]
return " ".join(set(tokens))
def cleanText_removeWhitespace(sentence):
whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
sentence = whitespaceFinder.sub(" ", sentence)
return sentence
#todo: preprocess pipe: removewhitespace, removePUNCT, resolveAbk, keepPOS, keepEnt, removeWords, normalizeSynonyms
def getFirstSynonym(word, thesaurus_gen):
word = word.lower()
# durch den thesaurrus iterieren
for syn_block in thesaurus_gen: # syn_block ist eine liste mit Synonymen
for syn in syn_block:
syn = syn.lower()
if re.match(r'\A[\w-]+\Z', syn): # falls syn einzelwort ist
if word == syn:
return getHauptform(syn_block, word)
else: # falls es ein satz ist
if word in syn:
return getHauptform(syn_block, word)
return word # zur Not, das ursrpüngliche Wort zurückgeben
def getHauptform(syn_block, word, default_return_first_Syn=False):
for syn in syn_block:
syn = syn.lower()
if "hauptform" in syn and len(syn.split(" ")) <= 2:
# nicht ausgeben, falls es in Klammern steht
for w in syn.split(" "):
if not re.match(r'\([^)]+\)', w):
return w
if default_return_first_Syn:
# falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht
for w in syn_block:
if not re.match(r'\([^)]+\)', w):
return w
return word # zur Not, das ursrpüngliche Wort zurückgeben
"""
def printRandomDoc(textacyCorpus):
    print()
    print("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
    print()

####################'####################'####################'####################'####################'##############
# todo config-file

DATAPATH = "ticketSamples.xml"
DATAPATH_thesaurus = "openthesaurus.csv"

normalize_Synonyms = True
clean = True
lemmatize = True

custom_words = ["grüßen", "fragen"]

####################'####################'####################'####################'####################'##############

## files to textacy-corpi
textacyCorpus = textacy.Corpus(PARSER)

print("add texts to textacy-corpi...")
textacyCorpus.add_texts(texts=generateTextfromTicketXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromTicketXML(DATAPATH))

#for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize):
#    textacyCorpus.add_text(txt, dic)

for doc in textacyCorpus:
    print(doc.metadata)
    print(doc.text)

#print(textacyCorpus[2].text)
#printRandomDoc(textacyCorpus)
#print(textacyCorpus[len(textacyCorpus)-1].text)

print()
print()


@ -1,213 +0,0 @@
# -*- coding: utf-8 -*-
import spacy
import textacy
from spacy.tokens import Doc

# -*- coding: utf-8 -*-
import re
import spacy
import functools
import textacy

class TextCleaner:

    def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4All=None):
        """
        :param parser: spacy-parser
        :param thesaurus: [[syn1, syn2, ...],[syn1, syn2, ...], ...]
        :param customClass_symbols:[str]
        :param customClass_words:[str]
        :param customClassPOS:[str]
        :param keep4All: [str]
        """
        if thesaurus is None:
            DATAPATH_thesaurus = "openthesaurus.csv"

            ## !!!!!! the list() matters: otherwise a generator would be consumed at runtime and not return the same synonyms again
            self.thesaurus = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))
        else:
            self.thesaurus = thesaurus

        self.parser = parser

        #self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
        self.mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
        self.emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
        self.urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)

        # to keep
        self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]  # ,"PERSON"]
        self.pos2keep = ["NOUN"]  # , "NUM" ]#,"VERB","ADJ"]  # for topic modeling, nouns only http://aclweb.org/anthology/U15-1013
        """
        # to remove
        self.symbols = ["-----", "---", "...", "", "", ".", "-", "<", ">", ",", "?", "!", "..", "nt", "n't", "|", "||",
                        ";", ":",
                        "", "s", "'s", ".", "(", ")", "[", "]", "#"] + (customClass_symbols if customClass_symbols is not None else [])
        self.stop_words