From 86ee5d7fba930e4c4b53e5faef3070c186a6f69d Mon Sep 17 00:00:00 2001 From: "jannis.grundmann" Date: Thu, 31 Aug 2017 14:54:01 +0200 Subject: [PATCH 1/9] textcleaning verfeinert --- openthesaurus.csv | 4 +- preprocessing.py | 354 ++++++++++++++++++++++++++++++++++++---------- test.py | 220 ++++++++++++++++------------ 3 files changed, 411 insertions(+), 167 deletions(-) diff --git a/openthesaurus.csv b/openthesaurus.csv index a2348f7..caad708 100644 --- a/openthesaurus.csv +++ b/openthesaurus.csv @@ -1,3 +1,5 @@ +TH;Technische_Universität (Hauptform);Technische Hochschule;TU +Passwort (Hauptform);Kodewort;Schlüsselwort;Zugangscode;Kennwort (Hauptform);Geheimcode;Losung;Codewort;Zugangswort;Losungswort;Parole Fission;Kernfission;Kernspaltung;Atomspaltung Wiederaufnahme;Fortführung davonfahren;abdüsen (ugs.);aufbrechen;abfliegen;abfahren;(von etwas) fortfahren;abreisen;wegfahren;wegfliegen @@ -2182,7 +2184,6 @@ Spitzenklöppel (Handarbeit);Glockenklöppel;Klöppel gutartig;benigne (fachspr.) Beutelratte;Taschenratte rollen;kollern (ugs.);kullern;kugeln -Kodewort;Schlüsselwort;Zugangscode;Kennwort (Hauptform);Geheimcode;Losung;Codewort;Zugangswort;Passwort (Hauptform);Losungswort;Parole packen;einpacken Ratschluss;Urteil;Wille;Entscheidung;Entschlossenheit;Beschluss;das letzte Wort (ugs.);Entschluss;Entscheid (schweiz.) dreckig machen;versiffen;beschmutzen;verschmutzen @@ -4207,7 +4208,6 @@ Akzise;Oktroi;Verbrauchsabgabe Aufrührer;Tumultant genügsam;bedürfnislos zeigen;offenbaren;bekunden;kundtun -TH;Technische Universität;Technische Hochschule;TU Versprechen;Absichtserklärung (Nachrichtensprache);Zusicherung;Versicherung;Beteuerung Beschaulichkeit;Stille Auswärtiges Amt;Außenamt (ugs.);Außenministerium (ugs.);AA;Ministerium für Auswärtige Angelegenheiten diff --git a/preprocessing.py b/preprocessing.py index e9f5275..536f426 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -17,15 +17,7 @@ import keras.backend as K """ csv.field_size_limit(sys.maxsize) - -def printRandomDoc(textacyCorpus): - print() - print("len(textacyCorpus) = %i" % len(textacyCorpus)) - randIndex = int((len(textacyCorpus) - 1) * random.random()) - print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata)) - print() - - +""" def getFirstSynonym(word, thesaurus_gen): word = word.lower() @@ -58,8 +50,9 @@ def getFirstSynonym(word, thesaurus_gen): return word # zur Not die eingabe ausgeben -def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False): - import re +""" +""" +def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False, normalize_synonyms=False): # use preprocessing if customPreprocessing is not None: @@ -119,7 +112,7 @@ def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=No tokens = [] added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"] - added_POS = ["NOUN", "NUM" ]#,"VERB","ADJ"] #IDEE NUM mit in den Corpus aufnehmen, aber fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 + added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #IDEE NUM mit in den Corpus aufnehmen, aber fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 # append Tokens to a list for tok in spacy_doc: @@ -148,55 +141,33 @@ def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=No tokens = [tok for tok in tokens if len(tok)>1] # remove large strings of 
whitespace - while "" in tokens: - tokens.remove("") - while " " in tokens: - tokens.remove(" ") - while "\n" in tokens: - tokens.remove("\n") - while "\n\n" in tokens: - tokens.remove("\n\n") - """ - tokenz = [] - for tok in tokens: - tokenz.append(str(getFirstSynonym(tok,THESAURUS_gen))) - tokens = tokenz - """ - tokens = [str(getFirstSynonym(tok,THESAURUS_gen)) for tok in tokens] + remove_large_strings_of_whitespace(" ".join(tokens)) + + + #idee abkürzungen auflösen (v.a. TU -> Technische Universität) + + if normalize_synonyms: + tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens] return " ".join(tokens) -def generateTextfromXML(path2xml, clean=True, textfield='Beschreibung'): - import xml.etree.ElementTree as ET +def remove_large_strings_of_whitespace(sentence): - tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) + whitespaceFinder = re.compile(r'(\r\n|\r|\n)', re.IGNORECASE) + sentence = whitespaceFinder.sub(" ", sentence) - root = tree.getroot() + tokenlist = sentence.split(" ") - for subject in root.iter(textfield): - if clean: - yield cleanText(subject.text) - else: - yield subject.text + while "" in tokenlist: + tokenlist.remove("") + while " " in tokenlist: + tokenlist.remove(" ") -def generateMetadatafromXML(path2xml, keys=["Loesung","Kategorie","Zusammenfassung"]): - import xml.etree.ElementTree as ET - - tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) - - root = tree.getroot() - - metadata = dict.fromkeys(keys) - - - for ticket in root.findall('ticket'): - for key in metadata: - metadata[key] = ticket.find(key).text - - yield metadata - -def generateFromXML(path2xml, clean=True, textfield='Beschreibung'): + return " ".join(tokenlist) +""" +""" +def generateFromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False,lemmatize=False): import xml.etree.ElementTree as ET tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) @@ -208,58 +179,291 @@ def generateFromXML(path2xml, clean=True, textfield='Beschreibung'): for field in ticket: if field.tag == textfield: if clean: - text = cleanText(field.text) + text = cleanText_words(field.text,PARSER,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize) else: text = field.text else: + #idee hier auch cleanen? 
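+            # i.e.: idea - maybe clean these metadata fields here as well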
metadata[field.tag] = field.text yield text, metadata +""" +LANGUAGE = 'de' +PARSER = spacy.load(LANGUAGE) + + + +def generateTextfromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False,lemmatize=False): + import xml.etree.ElementTree as ET + + tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) + + root = tree.getroot() + """ + for subject in root.iter(textfield): + if clean: + yield cleanText(subject.text) + else: + yield subject.text + """ + for ticket in root: + text = "ERROR" + for field in ticket: + if field.tag == textfield: + if clean: + text = cleanText_words(field.text,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize) + else: + text = field.text + yield text + +def generateMetadatafromXML(path2xml, textfield='Beschreibung'):#,keys_to_clean=["Loesung","Zusammenfassung"]): + import xml.etree.ElementTree as ET + + tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) + + root = tree.getroot() + + """ + metadata = dict.fromkeys(keys) + + for ticket in root.findall('ticket'): + for key in metadata: + metadata[key] = ticket.find(key).text + + yield metadata + """ + for ticket in root: + metadata = {} + for field in ticket: + if field.tag != textfield: + if field.tag == "Zusammenfassung": + # idee lösung nur whitespace entfernen, zusammenfassung auch von symbolen befreien + metadata[field.tag] = cleanText_symbols(field.text) + elif field.tag == "Loesung": + metadata[field.tag] = remove_whitespace(field.text) + else: + metadata[field.tag] = field.text + + yield metadata + + +def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None): + """ + https://spacy.io/docs/usage/pos-tagging + + cleans text from PUNCT, NUM, whitespaces, newlines, and the following list of symbols: + + ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + + """ + if custom_symbols is not None: + custom_symbols = custom_symbols + else: + custom_symbols = [] + + if keep is not None: + keep = keep + else: + keep = [] + + # List of symbols we don't care about + symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols + + # parse with spaCy + spacy_doc = parser(string) + tokens = [] + + pos = ["NUM", "SPACE", "PUNCT"] + for p in keep: + pos.remove(p) + + + # append Tokens to a list + for tok in spacy_doc: + if tok.pos_ not in pos and tok.text not in symbols: + tokens.append(tok.text) + + return " ".join(tokens) + +def cleanText_words(string,parser=PARSER, custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False): + + # use preprocessing + if customPreprocessing is not None: + string = customPreprocessing(string) + + if custom_stopwords is not None: + custom_stopwords = custom_stopwords + else: + custom_stopwords = [] + + if custom_words is not None: + custom_words = custom_words + else: + custom_words = [] + + + # custom stoplist + # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import + stop_words = __import__("spacy." 
+ parser.lang, globals(), locals(), ['object']).STOP_WORDS
+
+    stoplist =list(stop_words) + custom_stopwords
+
+    # replace twitter
+    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
+    string = mentionFinder.sub("MENTION", string)
+
+    # replace emails
+    emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
+    string = emailFinder.sub("EMAIL", string)
+
+    # replace urls
+    urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
+    string = urlFinder.sub("URL", string)
+
+    # replace HTML symbols
+    string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
+
+
+
+    # parse with spaCy
+    spacy_doc = parser(string)
+    tokens = []
+
+    added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"]
+    added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] # for topic modeling keep only nouns http://aclweb.org/anthology/U15-1013
+
+    # append Tokens to a list
+    for tok in spacy_doc:
+        if tok.pos_ in added_POS:
+            if lemmatize:
+                tokens.append(tok.lemma_.lower().strip())
+            else:
+                tokens.append(tok.text.lower().strip())
+
+        # add entities
+        if tok.ent_type_ in added_entities:
+            tokens.append(tok.text.lower())
+
+
+
+    # remove stopwords
+    tokens = [tok for tok in tokens if tok not in stoplist]
+
+    # remove custom_words
+    tokens = [tok for tok in tokens if tok not in custom_words]
+
+    # remove single characters
+    tokens = [tok for tok in tokens if len(tok)>1]
+
+    # remove large strings of whitespace
+    #remove_whitespace(" ".join(tokens))
+
+
+    # idea: expand abbreviations (esp. TU -> Technische Universität)
+
+    if normalize_synonyms:
+        tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens]
+
+    return " ".join(set(tokens))
+
+def remove_whitespace(sentence):
+    whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
+    sentence = whitespaceFinder.sub(" ", sentence)
+    return sentence
+
+def getFirstSynonym(word, thesaurus_gen):
+
+    word = word.lower()
+
+
+    # iterate over the thesaurus
+    for syn_block in thesaurus_gen: # syn_block is a list of synonyms
+
+        for syn in syn_block:
+            syn = syn.lower()
+            if re.match(r'\A[\w-]+\Z', syn): # if syn is a single word
+                if word == syn:
+                    return getHauptform(syn_block, word)
+            else: # if it is a phrase
+                if word in syn:
+                    return getHauptform(syn_block, word)
+    return word # as a fallback, return the original word
+
+def getHauptform(syn_block, word, default_return_first_Syn=False):
+
+    for syn in syn_block:
+        syn = syn.lower()
+
+        if "hauptform" in syn and len(syn.split(" ")) <= 2:
+            # do not return it if it is in parentheses
+            for w in syn.split(" "):
+                if not re.match(r'\([^)]+\)', w):
+                    return w
+
+        if default_return_first_Syn:
+            # if no Hauptform is present, return the first synonym that is not a phrase and not in parentheses
+            for w in syn_block:
+                if not re.match(r'\([^)]+\)', w):
+                    return w
+    return word # as a fallback, return the original word
+
+
+def printRandomDoc(textacyCorpus):
+    print()
+
+    print("len(textacyCorpus) = %i" % len(textacyCorpus))
+    randIndex = int((len(textacyCorpus) - 1) * random.random())
+    print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
+
+    print()
+
 ####################'####################'####################'####################'####################'##############
-
+import de_core_news_md
 DATAPATH = "ticketSamples.xml"
 DATAPATH_thesaurus = "openthesaurus.csv"
 
-LANGUAGE = 'de'
+normalize_Synonyms = True
+clean = True
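+# These flags are passed into generateTextfromXML() below. With the openthesaurus.csv
+# rows added in this patch, the synonym normalization step roughly maps, e.g.:
+#   getFirstSynonym("kodewort", THESAURUS_list) -> "passwort"
+#   getFirstSynonym("tu", THESAURUS_list)       -> "technische_universität"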
+lemmatize = True + +custom_words = ["grüßen", "fragen"] + ####################'####################'####################'####################'####################'############## -PARSER = spacy.load(LANGUAGE) -THESAURUS_gen = textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";") # generator [[a,b,c,..],[a,b,c,..],...] +#PARSER = de_core_news_md.load() + +THESAURUS_list=list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";")) ## !!!!!! list wichtig, da sonst nicht die gleichen Synonyme zurückgegeben werden, weil der generator während der laufzeit pickt + + ## files to textacy-corpus textacyCorpus = textacy.Corpus(PARSER) print("add texts to textacy-corpus...") -#textacyCorpus.add_texts(texts=generateTextfromXML(DATAPATH), metadatas=generateMetadatafromXML(DATAPATH)) -for txt, dic in generateFromXML(DATAPATH): - textacyCorpus.add_text(txt,dic) +textacyCorpus.add_texts(texts=generateTextfromXML(DATAPATH,normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromXML(DATAPATH)) + + +#for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize): +# textacyCorpus.add_text(txt,dic) +for doc in textacyCorpus: + print(doc.metadata) + print(doc.text) - -print(textacyCorpus[2].text) +#print(textacyCorpus[2].text) #printRandomDoc(textacyCorpus) #print(textacyCorpus[len(textacyCorpus)-1].text) - - - - - - - - - - - - +print() +print() diff --git a/test.py b/test.py index ec4a3db..201e1c9 100644 --- a/test.py +++ b/test.py @@ -8,90 +8,146 @@ import xml.etree.ElementTree as ET DATAPATH_thesaurus = "openthesaurus.csv" -def generateFromXML(path2xml, clean=True, textfield='Beschreibung'): - import xml.etree.ElementTree as ET - tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) - root = tree.getroot() +PARSER = spacy.load('de') - for ticket in root: - metadata = {} - text = "ERROR" - for field in ticket: - if field.tag == textfield: - if clean: - text = (field.text) + + +def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None): + """ + https://spacy.io/docs/usage/pos-tagging + + cleans text from PUNCT, NUM, whitespaces, newlines, and the following list of symbols: + + ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + + """ + if custom_symbols is not None: + custom_symbols = custom_symbols + else: + custom_symbols = [] + + if keep is not None: + keep = keep + else: + keep = [] + + # List of symbols we don't care about + symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols + + # parse with spaCy + spacy_doc = parser(string) + tokens = [] + + pos = ["NUM", "SPACE", "PUNCT"] + for p in keep: + pos.remove(p) + + + # append Tokens to a list + for tok in spacy_doc: + if tok.pos_ not in pos: + tokens.append(tok.text.lower().strip()) + + + # remove symbols + tokens = [tok for tok in tokens if tok not in symbols] + + # remove whitespace + remove_whitespace(" ".join(tokens)) + + return " ".join(tokens) + +def cleanText_words(string, parser=PARSER, custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False): + + # use preprocessing + if customPreprocessing is not None: + string = customPreprocessing(string) + + if custom_stopwords is not None: + custom_stopwords = custom_stopwords + else: + custom_stopwords = [] + + if custom_words is not 
None: + custom_words = custom_words + else: + custom_words = [] + + + # custom stoplist + # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import + stop_words = __import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS + + stoplist =list(stop_words) + custom_stopwords + + # replace twitter + mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) + string = mentionFinder.sub("MENTION", string) + + # replace emails + emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE) + string = emailFinder.sub("EMAIL", string) + + # replace urls + urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE) + string = urlFinder.sub("URL", string) + + # replace HTML symbols + string = string.replace("&", "and").replace(">", ">").replace("<", "<") + + + + # parse with spaCy + spacy_doc = parser(string) + tokens = [] + + added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"] + added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 + + # append Tokens to a list + for tok in spacy_doc: + if tok.pos_ in added_POS: + if lemmatize: + tokens.append(tok.lemma_.lower().strip()) else: - text = field.text - else: - metadata[field.tag] = field.text - yield text, metadata + tokens.append(tok.text.lower().strip()) - -def getFirstSynonym(word, thesaurus_gen): - - word = word.lower() - # TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python - - - # durch den thesaurrus iterieren - for syn_block in thesaurus_gen: # syn_block ist eine liste mit Synonymen - - # durch den synonymblock iterieren - for syn in syn_block: - syn = syn.lower().split(" ") if not re.match(r'\A[\w-]+\Z', syn) else syn # aus synonym mach liste (um evtl. sätze zu identifieziren) - - # falls das wort in dem synonym enthalten ist (also == einem Wort in der liste ist) - if word in syn: - - # Hauptform suchen - if "Hauptform" in syn: - # nicht ausgeben, falls es in Klammern steht - for w in syn: - if not re.match(r'\([^)]+\)', w) and w is not None: - return w - - # falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht - if len(syn) == 1: - w = syn[0] - if not re.match(r'\([^)]+\)', w) and w is not None: - return w - - return word # zur Not die eingabe ausgeben + # add entities + if tok.ent_type_ in added_entities: + tokens.append(tok.text.lower()) -def getFirstSynonym(word, thesaurus_gen): + # remove stopwords + tokens = [tok for tok in tokens if tok not in stoplist] - word = word.lower() - # TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python + # remove custom_words + tokens = [tok for tok in tokens if tok not in custom_words] + + # remove single characters + tokens = [tok for tok in tokens if len(tok)>1] + + # remove large strings of whitespace + #remove_whitespace(" ".join(tokens)) - # durch den thesaurrus iterieren - for syn_block in thesaurus_gen: # syn_block ist eine liste mit Synonymen + #idee abkürzungen auflösen (v.a. 
TU -> Technische Universität) - for syn in syn_block: + #if normalize_synonyms: + # tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens] - if re.match(r'\A[\w-]+\Z', syn): #falls syn einzelwort ist - if word == syn: - getHauptform(syn_block) + return " ".join(tokens) +def remove_whitespace(sentence): + whitespaceFinder = re.compile(r'(\r\n|\r|\n|\s)', re.IGNORECASE) + sentence = whitespaceFinder.sub(" ", sentence) + return sentence - - -def getHauptform(syn_block): - for s in syn_block: - if "Hauptform" in s: - # nicht ausgeben, falls es in Klammern steht - for w in s: - if not re.match(r'\([^)]+\)', w) and w is not None: - return w - - # falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht - if len(s) == 1: - w = s[0] - if not re.match(r'\([^)]+\)', w) and w is not None: - return w +def cleanText_normalize(string, parser=PARSER, customPreprocessing=cleanText_words, lemmatize=True): + # use preprocessing + if customPreprocessing is not None: + string = customPreprocessing(string) @@ -101,25 +157,9 @@ def getHauptform(syn_block): +string = "Frau Hinrichs überdenkt die Situation und 545453 macht dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test" +print(cleanText_symbols(string=string, parser=PARSER, keep=["NUM"])) - - - - - - - - -strings = ["passwort",""] -THESAURUS_gen = textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";") # generator [[a,b,c,..],[a,b,c,..],...] - -for s in strings: - print(getFirstSynonym(s,THESAURUS_gen)) - - - - - - - +string = "Frau Hinrichs überdenkt die Situation und 545453 macht dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test" +print(cleanText_symbols(string=string, parser=PARSER, keep=None)) From 11e77fad06427eec1d19d9d98fe264d77ecac5c8 Mon Sep 17 00:00:00 2001 From: "jannis.grundmann" Date: Fri, 1 Sep 2017 14:27:03 +0200 Subject: [PATCH 2/9] textcleaning refactored --- preprocessing.py | 61 +++++------- test.py | 165 ------------------------------- textCleaning.py | 245 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 267 insertions(+), 204 deletions(-) delete mode 100644 test.py create mode 100644 textCleaning.py diff --git a/preprocessing.py b/preprocessing.py index 536f426..f33836a 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -190,73 +190,57 @@ def generateFromXML(path2xml, textfield='Beschreibung', clean=False, normalize_ LANGUAGE = 'de' +#PARSER = de_core_news_md.load() PARSER = spacy.load(LANGUAGE) +from textCleaning import TextCleaner + +cleaner = TextCleaner(parser=PARSER) -def generateTextfromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False,lemmatize=False): +def generateTextfromTicketXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False, lemmatize=False): import xml.etree.ElementTree as ET tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) - root = tree.getroot() - """ - for subject in root.iter(textfield): - if clean: - yield cleanText(subject.text) - else: - yield subject.text - """ + + for ticket in root: text = "ERROR" for field in ticket: if field.tag == textfield: if clean: - text = cleanText_words(field.text,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize) + text = cleaner.normalizeSynonyms(cleaner.removeWords(cleaner.keepPOSandENT(field.text))) #,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize) else: text = field.text yield text -def generateMetadatafromXML(path2xml, 
textfield='Beschreibung'):#,keys_to_clean=["Loesung","Zusammenfassung"]): +def generateMetadatafromTicketXML(path2xml, textfield='Beschreibung'):#,keys_to_clean=["Loesung","Zusammenfassung"]): import xml.etree.ElementTree as ET tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) root = tree.getroot() - """ - metadata = dict.fromkeys(keys) - - for ticket in root.findall('ticket'): - for key in metadata: - metadata[key] = ticket.find(key).text - - yield metadata - """ for ticket in root: metadata = {} for field in ticket: if field.tag != textfield: if field.tag == "Zusammenfassung": - # idee lösung nur whitespace entfernen, zusammenfassung auch von symbolen befreien - metadata[field.tag] = cleanText_symbols(field.text) + metadata[field.tag] = cleaner.removePunctuation(field.text) elif field.tag == "Loesung": - metadata[field.tag] = remove_whitespace(field.text) + metadata[field.tag] = cleaner.removeWhitespace(field.text) else: metadata[field.tag] = field.text yield metadata + + +""" def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None): - """ - https://spacy.io/docs/usage/pos-tagging - cleans text from PUNCT, NUM, whitespaces, newlines, and the following list of symbols: - - ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] - - """ if custom_symbols is not None: custom_symbols = custom_symbols else: @@ -360,18 +344,21 @@ def cleanText_words(string,parser=PARSER, custom_stopwords=None, custom_words=No #remove_whitespace(" ".join(tokens)) - #idee abkürzungen auflösen (v.a. TU -> Technische Universität) + #idee abkürzungen auflösen (v.a. TU -> Technische Universität): abkürzungsverezeichnis if normalize_synonyms: tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens] return " ".join(set(tokens)) -def remove_whitespace(sentence): +def cleanText_removeWhitespace(sentence): whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE) sentence = whitespaceFinder.sub(" ", sentence) return sentence +#todo: preprocess pipe: removewhitespace, removePUNCT, resolveAbk, keepPOS, keepEnt, removeWords, normalizeSynonyms + + def getFirstSynonym(word, thesaurus_gen): word = word.lower() @@ -407,7 +394,7 @@ def getHauptform(syn_block, word, default_return_first_Syn=False): if not re.match(r'\([^)]+\)', w): return w return word # zur Not, das ursrpüngliche Wort zurückgeben - +""" def printRandomDoc(textacyCorpus): print() @@ -434,18 +421,14 @@ custom_words = ["grüßen", "fragen"] ####################'####################'####################'####################'####################'############## -#PARSER = de_core_news_md.load() - -THESAURUS_list=list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";")) ## !!!!!! 
list wichtig, da sonst nicht die gleichen Synonyme zurückgegeben werden, weil der generator während der laufzeit pickt - - +#todo joar diese pipe halt und vllt ne config-file ## files to textacy-corpus textacyCorpus = textacy.Corpus(PARSER) print("add texts to textacy-corpus...") -textacyCorpus.add_texts(texts=generateTextfromXML(DATAPATH,normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromXML(DATAPATH)) +textacyCorpus.add_texts(texts=generateTextfromTicketXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromTicketXML(DATAPATH)) #for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize): diff --git a/test.py b/test.py deleted file mode 100644 index 201e1c9..0000000 --- a/test.py +++ /dev/null @@ -1,165 +0,0 @@ -# -*- coding: utf-8 -*- -import re - -import spacy -import textacy -import xml.etree.ElementTree as ET - - -DATAPATH_thesaurus = "openthesaurus.csv" - - -PARSER = spacy.load('de') - - - -def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None): - """ - https://spacy.io/docs/usage/pos-tagging - - cleans text from PUNCT, NUM, whitespaces, newlines, and the following list of symbols: - - ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] - - """ - if custom_symbols is not None: - custom_symbols = custom_symbols - else: - custom_symbols = [] - - if keep is not None: - keep = keep - else: - keep = [] - - # List of symbols we don't care about - symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols - - # parse with spaCy - spacy_doc = parser(string) - tokens = [] - - pos = ["NUM", "SPACE", "PUNCT"] - for p in keep: - pos.remove(p) - - - # append Tokens to a list - for tok in spacy_doc: - if tok.pos_ not in pos: - tokens.append(tok.text.lower().strip()) - - - # remove symbols - tokens = [tok for tok in tokens if tok not in symbols] - - # remove whitespace - remove_whitespace(" ".join(tokens)) - - return " ".join(tokens) - -def cleanText_words(string, parser=PARSER, custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False): - - # use preprocessing - if customPreprocessing is not None: - string = customPreprocessing(string) - - if custom_stopwords is not None: - custom_stopwords = custom_stopwords - else: - custom_stopwords = [] - - if custom_words is not None: - custom_words = custom_words - else: - custom_words = [] - - - # custom stoplist - # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import - stop_words = __import__("spacy." 
+ parser.lang, globals(), locals(), ['object']).STOP_WORDS - - stoplist =list(stop_words) + custom_stopwords - - # replace twitter - mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) - string = mentionFinder.sub("MENTION", string) - - # replace emails - emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE) - string = emailFinder.sub("EMAIL", string) - - # replace urls - urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE) - string = urlFinder.sub("URL", string) - - # replace HTML symbols - string = string.replace("&", "and").replace(">", ">").replace("<", "<") - - - - # parse with spaCy - spacy_doc = parser(string) - tokens = [] - - added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"] - added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 - - # append Tokens to a list - for tok in spacy_doc: - if tok.pos_ in added_POS: - if lemmatize: - tokens.append(tok.lemma_.lower().strip()) - else: - tokens.append(tok.text.lower().strip()) - - # add entities - if tok.ent_type_ in added_entities: - tokens.append(tok.text.lower()) - - - - # remove stopwords - tokens = [tok for tok in tokens if tok not in stoplist] - - # remove custom_words - tokens = [tok for tok in tokens if tok not in custom_words] - - # remove single characters - tokens = [tok for tok in tokens if len(tok)>1] - - # remove large strings of whitespace - #remove_whitespace(" ".join(tokens)) - - - #idee abkürzungen auflösen (v.a. TU -> Technische Universität) - - #if normalize_synonyms: - # tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens] - - return " ".join(tokens) - -def remove_whitespace(sentence): - whitespaceFinder = re.compile(r'(\r\n|\r|\n|\s)', re.IGNORECASE) - sentence = whitespaceFinder.sub(" ", sentence) - return sentence - -def cleanText_normalize(string, parser=PARSER, customPreprocessing=cleanText_words, lemmatize=True): - # use preprocessing - if customPreprocessing is not None: - string = customPreprocessing(string) - - - - - - - - - -string = "Frau Hinrichs überdenkt die Situation und 545453 macht dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test" -print(cleanText_symbols(string=string, parser=PARSER, keep=["NUM"])) - - -string = "Frau Hinrichs überdenkt die Situation und 545453 macht dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test" -print(cleanText_symbols(string=string, parser=PARSER, keep=None)) diff --git a/textCleaning.py b/textCleaning.py new file mode 100644 index 0000000..a014728 --- /dev/null +++ b/textCleaning.py @@ -0,0 +1,245 @@ +# -*- coding: utf-8 -*- +import re +import spacy +import functools + +import textacy + + +class TextCleaner: + + def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4Class=None): + """ + :param parser: spacy-parser + :param thesaurus: [[syn1, syn2, ...],[syn1, syn2, ...], ...] + :param customClass_symbols:[str] + :param customClass_words:[str] + :param customClassPOS:[str] + :param keep4Class: [str] + """ + if thesaurus is None: + DATAPATH_thesaurus = "openthesaurus.csv" + + ## !!!!!! 
list wichtig, da sonst nicht die gleichen Synonyme zurückgegeben werden, weil ein generator während der laufzeit pickt + self.thesaurus = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";")) + else: + self.thesaurus = thesaurus + + self.parser = parser + + + + self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE) + self.mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) + self.emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE) + self.urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE) + + + + # to remove + self.symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|", "||", + ";", ":", + "…", "’s", "'s", ".", "(", ")", "[", "]", "#"] + (customClass_symbols if customClass_symbols is not None else []) + self.stop_words = list(__import__("spacy." + self.parser.lang, globals(), locals(), ['object']).STOP_WORDS)+ (customClass_words if customClass_words is not None else []) + + + + # to keep + self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"] # ,"PERSON"] + self.pos2keep = ["NOUN"] # , "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 + + self.entities2keep = self.entities2keep + (keep4Class if keep4Class is not None else []) + self.pos2keep = self.pos2keep + (keep4Class if keep4Class is not None else []) + + + keep = (keep4Class if hasattr(keep4Class, '__iter__') else []) + self.pos2keep + self.entities2keep + + + # modify those to remove with those to keep + for sym in keep: + try: + self.symbols.remove(sym) + except ValueError: + try: + self.stop_words.remove(sym) + except ValueError: + pass + + + # idee self.currentDoc = spacy.Doc für jeden String aber nicht füpr jede methode + + def removeWhitespace(self, string): + string = self.whitespaceFinder.sub(" ", string) + return string + + + def removePunctuation(self, string, custom_symbols=None, keep=None): + + + symbols = self.symbols + (custom_symbols if custom_symbols is not None else []) + + if hasattr(keep, '__iter__'): + for k in keep: + try: + symbols.remove(k) + except ValueError: + pass + + + # parse with spaCy + doc = self.parser(string) + tokens = [] + + # append Tokens to a list + for tok in doc: + if not tok.is_punct and not tok.is_space and tok.text not in symbols: + tokens.append(tok.text) + + return " ".join(tokens) + + def resolveAbbreviations(self,string): + return string #todo + + + def keepPOSandENT(self, string, customPOS=None, customEnt=None, remove=None): + + pos2keep = self.pos2keep + (customPOS if customPOS is not None else []) + ent = self.entities2keep + (customEnt if customEnt is not None else []) + + if hasattr(remove, '__iter__'): + for k in remove: + try: + ent.remove(k) + except ValueError: + try: + pos2keep.remove(k) + except ValueError: + pass + + # parse with spaCy + spacy_doc = self.parser(string) + tokens = [] + + # append Tokens to a list + for tok in spacy_doc: + + if tok.pos_ in pos2keep: + tokens.append(tok.text) + + if tok.ent_type_ in ent: + tokens.append(tok.text) + + return " ".join(set(tokens)) + + + + def removeWords(self,string, custom_words=None, keep=None, lemmatize=False): + + wordlist = self.stop_words + (custom_words if custom_words is not None else []) + if hasattr(keep, '__iter__'): + for k in keep: + try: + wordlist.remove(k) + except ValueError: + pass + + + + string = self.urlFinder.sub("URL", string) + string = self.emailFinder.sub("EMAIL", string) + string = 
self.mentionFinder.sub("MENTION", string)
+        string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
+
+
+        # parse with spaCy
+        spacy_doc = self.parser(string)
+        tokens = []
+
+        # append Tokens to a list
+        for tok in spacy_doc:
+
+            #do not include stopwords/customwords and single chars
+            if tok.text not in wordlist and len(tok)>1:
+                if lemmatize:
+                    tokens.append(tok.lemma_)
+                else:
+                    tokens.append(tok.lower_)
+        return " ".join(set(tokens))
+
+
+
+
+
+
+
+    def normalizeSynonyms(self, string, default_return_first_Syn=False):
+        # parse with spaCy
+        spacy_doc = self.parser(string)
+        tokens = []
+
+        tokens = [str(self.getFirstSynonym(tok, self.thesaurus, default_return_first_Syn=default_return_first_Syn)) for tok in spacy_doc]
+
+        return " ".join(set(tokens))
+
+
+
+    def getFirstSynonym(self,word, thesaurus, default_return_first_Syn=False):
+        if not isinstance(word, str):
+            return word
+
+
+        word = word.lower()
+
+
+        # iterate over the thesaurus
+        for syn_block in thesaurus: # syn_block is a list of synonyms
+
+            for syn in syn_block:
+                syn = syn.lower()
+                if re.match(r'\A[\w-]+\Z', syn): # if syn is a single word
+                    if word == syn:
+                        return self.getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn)
+                else: # if it is a phrase
+                    if word in syn:
+                        return self.getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn)
+        return word # as a fallback, return the original word
+
+    def getHauptform(self,syn_block, word, default_return_first_Syn=False):
+
+        for syn in syn_block:
+            syn = syn.lower()
+
+            if "hauptform" in syn and len(syn.split(" ")) <= 2:
+                # do not return it if it is in parentheses
+                for w in syn.split(" "):
+                    if not re.match(r'\([^)]+\)', w):
+                        return w
+
+            if default_return_first_Syn:
+                # if no Hauptform is present, return the first synonym that is not a phrase and not in parentheses
+                for w in syn_block:
+                    if not re.match(r'\([^)]+\)', w):
+                        return w
+        return word # as a fallback, return the original word
+
+
+
+
+cleaner = TextCleaner(parser=spacy.load('de'))
+
+string = "Frau Hinrichs überdenkt die tu Situation und 545453 macht ' dann neue Anträge. 
\n Dieses Ticket wird geschlossen \n \n test" + + +################################################################################################################# + +#todo funzt irgendwie nich wie's soll: https://mathieularose.com/function-composition-in-python/ +def compose(self,*functions): + return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x) + +pipeline = compose(functools.partial(cleaner.keepPOSandENT,lemmatize=True))#, cleaner.normalizeSynonyms) + +################################################################################################################# +print(cleaner.removePunctuation(string)) +print(cleaner.keepPOSandENT(string)) + + From 05b4f514d5323426e28bcdbbbef75556abbc2676 Mon Sep 17 00:00:00 2001 From: "jannis.grundmann" Date: Tue, 5 Sep 2017 11:52:39 +0200 Subject: [PATCH 3/9] spacy-pipeline / python funciton-composing versucht --- preprocessing.py | 4 +- test.py | 124 +++++++++++++++++++++++++++++++++++++++++++++++ textCleaning.py | 57 +++++++++++++++------- 3 files changed, 166 insertions(+), 19 deletions(-) create mode 100644 test.py diff --git a/preprocessing.py b/preprocessing.py index f33836a..9fb59fd 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -406,6 +406,7 @@ def printRandomDoc(textacyCorpus): print() ####################'####################'####################'####################'####################'############## +# todo config-file import de_core_news_md DATAPATH = "ticketSamples.xml" @@ -421,8 +422,7 @@ custom_words = ["grüßen", "fragen"] ####################'####################'####################'####################'####################'############## -#todo joar diese pipe halt und vllt ne config-file - +#todo https://spacy.io/docs/usage/customizing-pipeline ## files to textacy-corpus textacyCorpus = textacy.Corpus(PARSER) diff --git a/test.py b/test.py new file mode 100644 index 0000000..08db3a2 --- /dev/null +++ b/test.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- +import spacy +import textacy +from spacy.tokens import Doc + +# -*- coding: utf-8 -*- +import re +import spacy +import functools + +import textacy + + +class TextCleaner: + + def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4All=None): + """ + :param parser: spacy-parser + :param thesaurus: [[syn1, syn2, ...],[syn1, syn2, ...], ...] + :param customClass_symbols:[str] + :param customClass_words:[str] + :param customClassPOS:[str] + :param keep4All: [str] + """ + if thesaurus is None: + DATAPATH_thesaurus = "openthesaurus.csv" + + ## !!!!!! list wichtig, da sonst nicht die gleichen Synonyme zurückgegeben werden, weil ein generator während der laufzeit pickt + self.thesaurus = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";")) + else: + self.thesaurus = thesaurus + + self.parser = parser + + + + self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE) + self.mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) + self.emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE) + self.urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE) + + + + # to remove + self.symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|", "||", + ";", ":", + "…", "’s", "'s", ".", "(", ")", "[", "]", "#"] + (customClass_symbols if customClass_symbols is not None else []) + self.stop_words = list(__import__("spacy." 
+ self.parser.lang, globals(), locals(), ['object']).STOP_WORDS)+ (customClass_words if customClass_words is not None else []) + + + + # to keep + self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"] # ,"PERSON"] + self.pos2keep = ["NOUN"] # , "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 + + self.entities2keep = self.entities2keep + (keep4All if keep4All is not None else []) + self.pos2keep = self.pos2keep + (keep4All if keep4All is not None else []) + + + keep = (keep4All if hasattr(keep4All, '__iter__') else []) + self.pos2keep + self.entities2keep + + + # modify those to remove with those to keep + for sym in keep: + try: + self.symbols.remove(sym) + except ValueError: + pass + for sym in keep: + try: + self.stop_words.remove(sym) + except ValueError: + pass + + + def loadString(self,string): + self.currentDoc = self.parser(string) + + + def removeWhitespace(self, string): + return " ".join([tok.text for tok in self.currentDoc if not tok.is_space]) + + + def removePunctuation(self, string, custom_symbols=None, keep=None): + symbols = self.symbols + (custom_symbols if custom_symbols is not None else []) + if hasattr(keep, '__iter__'): + for k in keep: + try: + symbols.remove(k) + except ValueError: + pass + + return " ".join([tok.text for tok in self.currentDoc if not tok.is_punct and tok.text not in symbols]) + + +#todo funzt irgendwie nich wie's soll: https://mathieularose.com/function-composition-in-python/ +parser = spacy.load('de') +cleaner = TextCleaner(parser) +corpus = textacy.Corpus(parser) + + +def compose(self,*functions): + return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x) + +def composeo(*functions): + return functools.reduce(lambda f, g: lambda x: f(g(x)), functions) + +#pipeline = compose(functools.partial(removeWhitespace,lemmatize=True))#, cleaner.normalizeSynonyms) + +pipeline = composeo(cleaner.removePunctuation, cleaner.removeWhitespace, cleaner.loadString) + +def pipe1(string): + cleaner.loadString(string) + string = cleaner.removeWhitespace(string) + string = cleaner.removePunctuation(string) + return string + + +string = "Frau Hinrichs überdenkt die tu Situation und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test" +print(pipe1(string)) +corpus.add_text(pipeline(string)) + +print(corpus[0].text) + diff --git a/textCleaning.py b/textCleaning.py index a014728..ef6a819 100644 --- a/textCleaning.py +++ b/textCleaning.py @@ -8,14 +8,14 @@ import textacy class TextCleaner: - def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4Class=None): + def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4All=None): """ :param parser: spacy-parser :param thesaurus: [[syn1, syn2, ...],[syn1, syn2, ...], ...] 
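+                          e.g. one row of openthesaurus.csv read with textacy.fileio.read_csv(..., delimiter=";"):
+                          ["Passwort (Hauptform)", "Kodewort", "Schlüsselwort", "Zugangscode", ...]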
:param customClass_symbols:[str] :param customClass_words:[str] :param customClassPOS:[str] - :param keep4Class: [str] + :param keep4All: [str] """ if thesaurus is None: DATAPATH_thesaurus = "openthesaurus.csv" @@ -48,11 +48,11 @@ class TextCleaner: self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"] # ,"PERSON"] self.pos2keep = ["NOUN"] # , "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 - self.entities2keep = self.entities2keep + (keep4Class if keep4Class is not None else []) - self.pos2keep = self.pos2keep + (keep4Class if keep4Class is not None else []) + self.entities2keep = self.entities2keep + (keep4All if keep4All is not None else []) + self.pos2keep = self.pos2keep + (keep4All if keep4All is not None else []) - keep = (keep4Class if hasattr(keep4Class, '__iter__') else []) + self.pos2keep + self.entities2keep + keep = (keep4All if hasattr(keep4All, '__iter__') else []) + self.pos2keep + self.entities2keep # modify those to remove with those to keep @@ -60,18 +60,25 @@ class TextCleaner: try: self.symbols.remove(sym) except ValueError: - try: - self.stop_words.remove(sym) - except ValueError: - pass + pass + for sym in keep: + try: + self.stop_words.remove(sym) + except ValueError: + pass # idee self.currentDoc = spacy.Doc für jeden String aber nicht füpr jede methode + def loadString(self,string): + self.currentDoc = self.parser(string) + """ def removeWhitespace(self, string): string = self.whitespaceFinder.sub(" ", string) return string - + """ + def removeWhitespace(self, string): + return string def removePunctuation(self, string, custom_symbols=None, keep=None): @@ -225,11 +232,7 @@ class TextCleaner: -cleaner = TextCleaner(parser=spacy.load('de')) - -string = "Frau Hinrichs überdenkt die tu Situation und 545453 macht ' dann neue Anträge. 
\n Dieses Ticket wird geschlossen \n \n test" - - +""" ################################################################################################################# #todo funzt irgendwie nich wie's soll: https://mathieularose.com/function-composition-in-python/ @@ -239,7 +242,27 @@ def compose(self,*functions): pipeline = compose(functools.partial(cleaner.keepPOSandENT,lemmatize=True))#, cleaner.normalizeSynonyms) ################################################################################################################# -print(cleaner.removePunctuation(string)) -print(cleaner.keepPOSandENT(string)) +""" + + + + + + + + + + + + + + + + + + + + + From 73a13551c67f54834240cc686ad03f083afddb1c Mon Sep 17 00:00:00 2001 From: "jannis.grundmann" Date: Wed, 6 Sep 2017 15:51:14 +0200 Subject: [PATCH 4/9] composing geht irgendwie aber mehr probleme --- preprocessing.py | 1 - test.py | 123 ++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 106 insertions(+), 18 deletions(-) diff --git a/preprocessing.py b/preprocessing.py index 9fb59fd..89b6317 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -422,7 +422,6 @@ custom_words = ["grüßen", "fragen"] ####################'####################'####################'####################'####################'############## -#todo https://spacy.io/docs/usage/customizing-pipeline ## files to textacy-corpus textacyCorpus = textacy.Corpus(PARSER) diff --git a/test.py b/test.py index 08db3a2..9560698 100644 --- a/test.py +++ b/test.py @@ -32,15 +32,17 @@ class TextCleaner: self.parser = parser - - - self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE) + #self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE) self.mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) self.emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE) self.urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE) + # to keep + self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"] # ,"PERSON"] + self.pos2keep = ["NOUN"] # , "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 - + """ + # to remove self.symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|", "||", ";", ":", @@ -48,11 +50,6 @@ class TextCleaner: self.stop_words = list(__import__("spacy." 
+ self.parser.lang, globals(), locals(), ['object']).STOP_WORDS)+ (customClass_words if customClass_words is not None else []) - - # to keep - self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"] # ,"PERSON"] - self.pos2keep = ["NOUN"] # , "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 - self.entities2keep = self.entities2keep + (keep4All if keep4All is not None else []) self.pos2keep = self.pos2keep + (keep4All if keep4All is not None else []) @@ -71,7 +68,7 @@ class TextCleaner: self.stop_words.remove(sym) except ValueError: pass - + """ def loadString(self,string): self.currentDoc = self.parser(string) @@ -93,32 +90,124 @@ class TextCleaner: return " ".join([tok.text for tok in self.currentDoc if not tok.is_punct and tok.text not in symbols]) -#todo funzt irgendwie nich wie's soll: https://mathieularose.com/function-composition-in-python/ +def cleanDoc(doc, toDelete=None, toKeep=None): + """ + :param doc: spacyDoc + :param toDelete: [str] pos_ , ent_type_ or tag_ + :return: str tokenlist + """ + #keep + tokenlist = [] + for tok in doc: + if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep: + tokenlist.append(tok.text) + + #delete + tokenlist = [tok.text for tok in doc if tok.pos_ in toDelete and not tok.ent_type_ in toDelete and not tok.tag_ in toDelete] + + result = " ".join(tokenlist) + return result #problem: kein doc und daher nicht komponierbar + + +def keepinDoc(doc, toKeep=None): + """ + :param doc: spacyDoc + :param toDelete: [str] + :return: str tokenlist + """ + return " ".join([tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep]) + + +#todo https://mathieularose.com/function-composition-in-python/ parser = spacy.load('de') cleaner = TextCleaner(parser) -corpus = textacy.Corpus(parser) +corpus_raw = textacy.Corpus(parser) +corpus_clean = textacy.Corpus(parser) + +def foo(doc, toKeep=None): + + words = [tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep] + spaces = [True] * len(words) + + return Doc(doc.vocab,words=words,spaces=spaces) + +def foo2(doc, toDelete=None):#, toKeep=None): + """ + :param doc: spacyDoc + :param toDelete: [str] pos_ , ent_type_ or tag_ + :return: str tokenlist + """ + #keep + #tokenlist = [tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep] + + #delete + + words = [tok.text for tok in doc if tok.pos_ in toDelete and not tok.ent_type_ in toDelete and not tok.tag_ in toDelete] + spaces = [True] * len(words) + + return Doc(doc.vocab, words=words, spaces=spaces) +""" def compose(self,*functions): return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x) def composeo(*functions): return functools.reduce(lambda f, g: lambda x: f(g(x)), functions) +""" -#pipeline = compose(functools.partial(removeWhitespace,lemmatize=True))#, cleaner.normalizeSynonyms) +def double(a): + return a*2 -pipeline = composeo(cleaner.removePunctuation, cleaner.removeWhitespace, cleaner.loadString) +def add(a, b): + return a+b +def compose(*functions): + def compose2(f, g): + return lambda x: f(g(x)) + return functools.reduce(compose2, functions, lambda x: x) + + + + + +#pipeline = compose(removeFromDoc, cleaner.removeWhitespace, cleaner.loadString) +""" def pipe1(string): cleaner.loadString(string) string = cleaner.removeWhitespace(string) string = cleaner.removePunctuation(string) return string +""" + +def cleaningPipe(spacy_pipe, composition): + for doc in 
spacy_pipe: + yield composition(doc) + + +pipeline = compose( + functools.partial(foo2, toDelete=["PUNCT", "SPACE"]), + functools.partial(foo, toKeep=["NOUN"])) string = "Frau Hinrichs überdenkt die tu Situation und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test" -print(pipe1(string)) -corpus.add_text(pipeline(string)) -print(corpus[0].text) +doc = parser(string) + +#print(removeFromDoc(doc,toDelete=["PUNCT"])) + +print(pipeline(doc.text)) + + + +for txt in cleaningPipe(parser.pipe([string]),pipeline): + print(txt) +""" +corpus_raw.add_text(string) +for doc in parser.pipe([string]): + doc.text = removeFromDoc(doc, toDelete=["PUNCT"]) +""" + +#corpus_clean.add_texts(cleaningPipe(parser.pipe([string]),pipeline)) +#print(corpus_raw[0].text) From f09a261816420ee261b9b95cd03d9a18edf9e1f7 Mon Sep 17 00:00:00 2001 From: "jannis.grundmann" Date: Thu, 7 Sep 2017 14:59:59 +0200 Subject: [PATCH 5/9] composing mit text-stream angefangen --- textCleaning.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/textCleaning.py b/textCleaning.py index ef6a819..dae6afb 100644 --- a/textCleaning.py +++ b/textCleaning.py @@ -80,6 +80,8 @@ class TextCleaner: def removeWhitespace(self, string): return string + #self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE) + def removePunctuation(self, string, custom_symbols=None, keep=None): From 4dbb07ae3fcfd76128a954df8787731da934f566 Mon Sep 17 00:00:00 2001 From: "jannis.grundmann" Date: Mon, 11 Sep 2017 12:12:28 +0200 Subject: [PATCH 6/9] preprocessing erstmal soweit fertig. das mit der config wird noch verfeinert --- config.ini | 26 ++ old/preprocessing.py | 466 ++++++++++++++++++++++ test.py => old/test.py | 2 +- textCleaning.py => old/textCleaning.py | 15 +- openthesaurus.csv | 2 +- preprocessing.py | 519 ++++++++----------------- 6 files changed, 658 insertions(+), 372 deletions(-) create mode 100644 config.ini create mode 100644 old/preprocessing.py rename test.py => old/test.py (99%) rename textCleaning.py => old/textCleaning.py (99%) diff --git a/config.ini b/config.ini new file mode 100644 index 0000000..5612339 --- /dev/null +++ b/config.ini @@ -0,0 +1,26 @@ +[default] + +thesauruspath = openthesaurus.csv +path2xml = ticketSamples.xml + +language = de + +[preprocessing] + +ents = WORK_OF_ART,ORG,PRODUCT,LOC + +custom_words = grüßen,fragen + +#lemmatize = True + +default_return_first_Syn = False + + + + + + + + + + diff --git a/old/preprocessing.py b/old/preprocessing.py new file mode 100644 index 0000000..6bd8c3e --- /dev/null +++ b/old/preprocessing.py @@ -0,0 +1,466 @@ +# -*- coding: utf-8 -*- +import csv +import random +import sys + +import spacy +import textacy + +""" +import keras +import numpy as np +from keras.layers import Dense, SimpleRNN, LSTM, TimeDistributed, Dropout +from keras.models import Sequential +import keras.backend as K +""" +csv.field_size_limit(sys.maxsize) + +""" +def getFirstSynonym(word, thesaurus_gen): + + word = word.lower() + # TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python + + + # durch den thesaurrus iterieren + for syn_block in thesaurus_gen: # syn_block ist eine liste mit Synonymen + + # durch den synonymblock iterieren + for syn in syn_block: + syn = syn.lower().split(" ") if not re.match(r'\A[\w-]+\Z', syn) else syn # aus synonym mach liste (um evtl. 
sätze zu identifieziren) + + # falls das wort in dem synonym enthalten ist (also == einem Wort in der liste ist) + if word in syn: + + # Hauptform suchen + if "auptform" in syn: + # nicht ausgeben, falls es in Klammern steht + for w in syn: + if not re.match(r'\([^)]+\)', w) and w is not None: + return w + + # falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht + if len(syn) == 1: + w = syn[0] + if not re.match(r'\([^)]+\)', w) and w is not None: + return w + + return word # zur Not die eingabe ausgeben + + +""" +""" +def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False, normalize_synonyms=False): + + # use preprocessing + if customPreprocessing is not None: + string = customPreprocessing(string) + + + + if custom_stopwords is not None: + custom_stopwords = custom_stopwords + else: + custom_stopwords = [] + + if custom_words is not None: + custom_words = custom_words + else: + custom_words = [] + + if custom_symbols is not None: + custom_symbols = custom_symbols + else: + custom_symbols = [] + + + # custom stoplist + # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import + stop_words = __import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS + + stoplist =list(stop_words) + custom_stopwords + # List of symbols we don't care about either + symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols + + + + # get rid of newlines + string = string.strip().replace("\n", " ").replace("\r", " ") + + # replace twitter + mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) + string = mentionFinder.sub("MENTION", string) + + # replace emails + emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE) + string = emailFinder.sub("EMAIL", string) + + # replace urls + urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE) + string = urlFinder.sub("URL", string) + + # replace HTML symbols + string = string.replace("&", "and").replace(">", ">").replace("<", "<") + + + + + # parse with spaCy + spacy_doc = PARSER(string) + tokens = [] + + added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"] + added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #IDEE NUM mit in den Corpus aufnehmen, aber fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 + + # append Tokens to a list + for tok in spacy_doc: + if tok.pos_ in added_POS: + if lemmatize: + tokens.append(tok.lemma_.lower().strip()) + else: + tokens.append(tok.text.lower().strip()) + + # add entities + if tok.ent_type_ in added_entities: + tokens.append(tok.text.lower()) + + + + # remove stopwords + tokens = [tok for tok in tokens if tok not in stoplist] + + # remove symbols + tokens = [tok for tok in tokens if tok not in symbols] + + # remove custom_words + tokens = [tok for tok in tokens if tok not in custom_words] + + # remove single characters + tokens = [tok for tok in tokens if len(tok)>1] + + # remove large strings of whitespace + remove_large_strings_of_whitespace(" ".join(tokens)) + + + #idee abkürzungen auflösen (v.a. 
TU -> Technische Universität) + + if normalize_synonyms: + tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens] + + return " ".join(tokens) + + +def remove_large_strings_of_whitespace(sentence): + + whitespaceFinder = re.compile(r'(\r\n|\r|\n)', re.IGNORECASE) + sentence = whitespaceFinder.sub(" ", sentence) + + tokenlist = sentence.split(" ") + + while "" in tokenlist: + tokenlist.remove("") + while " " in tokenlist: + tokenlist.remove(" ") + + return " ".join(tokenlist) +""" +""" +def generateFromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False,lemmatize=False): + import xml.etree.ElementTree as ET + + tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) + root = tree.getroot() + + for ticket in root: + metadata = {} + text = "ERROR" + for field in ticket: + if field.tag == textfield: + if clean: + text = cleanText_words(field.text,PARSER,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize) + else: + text = field.text + else: + #idee hier auch cleanen? + metadata[field.tag] = field.text + yield text, metadata +""" + + +LANGUAGE = 'de' +#PARSER = de_core_news_md.load() +PARSER = spacy.load(LANGUAGE) + +from old.textCleaning import TextCleaner + +cleaner = TextCleaner(parser=PARSER) + + +def generateTextfromTicketXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False, lemmatize=False): + import xml.etree.ElementTree as ET + + tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) + root = tree.getroot() + + + for ticket in root: + text = "ERROR" + for field in ticket: + if field.tag == textfield: + if clean: + text = cleaner.normalizeSynonyms(cleaner.removeWords(cleaner.keepPOSandENT(field.text))) #,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize) + else: + text = field.text + yield text + +def generateMetadatafromTicketXML(path2xml, textfield='Beschreibung'):#,keys_to_clean=["Loesung","Zusammenfassung"]): + import xml.etree.ElementTree as ET + + tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) + + root = tree.getroot() + + for ticket in root: + metadata = {} + for field in ticket: + if field.tag != textfield: + if field.tag == "Zusammenfassung": + metadata[field.tag] = cleaner.removePunctuation(field.text) + elif field.tag == "Loesung": + metadata[field.tag] = cleaner.removeWhitespace(field.text) + else: + metadata[field.tag] = field.text + + yield metadata + + + + +""" +def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None): + + if custom_symbols is not None: + custom_symbols = custom_symbols + else: + custom_symbols = [] + + if keep is not None: + keep = keep + else: + keep = [] + + # List of symbols we don't care about + symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols + + # parse with spaCy + spacy_doc = parser(string) + tokens = [] + + pos = ["NUM", "SPACE", "PUNCT"] + for p in keep: + pos.remove(p) + + + # append Tokens to a list + for tok in spacy_doc: + if tok.pos_ not in pos and tok.text not in symbols: + tokens.append(tok.text) + + return " ".join(tokens) + +def cleanText_words(string,parser=PARSER, custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False): + + # use preprocessing + if customPreprocessing is not None: + string = customPreprocessing(string) + + if custom_stopwords is not None: + custom_stopwords = custom_stopwords + else: + custom_stopwords = [] + + if custom_words is 
not None: + custom_words = custom_words + else: + custom_words = [] + + + # custom stoplist + # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import + stop_words = __import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS + + stoplist =list(stop_words) + custom_stopwords + + # replace twitter + mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) + string = mentionFinder.sub("MENTION", string) + + # replace emails + emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE) + string = emailFinder.sub("EMAIL", string) + + # replace urls + urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE) + string = urlFinder.sub("URL", string) + + # replace HTML symbols + string = string.replace("&", "and").replace(">", ">").replace("<", "<") + + + + # parse with spaCy + spacy_doc = parser(string) + tokens = [] + + added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"] + added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 + + # append Tokens to a list + for tok in spacy_doc: + if tok.pos_ in added_POS: + if lemmatize: + tokens.append(tok.lemma_.lower().strip()) + else: + tokens.append(tok.text.lower().strip()) + + # add entities + if tok.ent_type_ in added_entities: + tokens.append(tok.text.lower()) + + + + # remove stopwords + tokens = [tok for tok in tokens if tok not in stoplist] + + # remove custom_words + tokens = [tok for tok in tokens if tok not in custom_words] + + # remove single characters + tokens = [tok for tok in tokens if len(tok)>1] + + # remove large strings of whitespace + #remove_whitespace(" ".join(tokens)) + + + #idee abkürzungen auflösen (v.a. 
TU -> Technische Universität): abkürzungsverezeichnis + + if normalize_synonyms: + tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens] + + return " ".join(set(tokens)) + +def cleanText_removeWhitespace(sentence): + whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE) + sentence = whitespaceFinder.sub(" ", sentence) + return sentence + +#todo: preprocess pipe: removewhitespace, removePUNCT, resolveAbk, keepPOS, keepEnt, removeWords, normalizeSynonyms + + +def getFirstSynonym(word, thesaurus_gen): + + word = word.lower() + + + # durch den thesaurrus iterieren + for syn_block in thesaurus_gen: # syn_block ist eine liste mit Synonymen + + for syn in syn_block: + syn = syn.lower() + if re.match(r'\A[\w-]+\Z', syn): # falls syn einzelwort ist + if word == syn: + return getHauptform(syn_block, word) + else: # falls es ein satz ist + if word in syn: + return getHauptform(syn_block, word) + return word # zur Not, das ursrpüngliche Wort zurückgeben + +def getHauptform(syn_block, word, default_return_first_Syn=False): + + for syn in syn_block: + syn = syn.lower() + + if "hauptform" in syn and len(syn.split(" ")) <= 2: + # nicht ausgeben, falls es in Klammern steht + for w in syn.split(" "): + if not re.match(r'\([^)]+\)', w): + return w + + if default_return_first_Syn: + # falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht + for w in syn_block: + if not re.match(r'\([^)]+\)', w): + return w + return word # zur Not, das ursrpüngliche Wort zurückgeben +""" + +def printRandomDoc(textacyCorpus): + print() + + print("len(textacyCorpus) = %i" % len(textacyCorpus)) + randIndex = int((len(textacyCorpus) - 1) * random.random()) + print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata)) + + print() + +####################'####################'####################'####################'####################'############## +# todo config-file + +DATAPATH = "ticketSamples.xml" +DATAPATH_thesaurus = "openthesaurus.csv" + + + +normalize_Synonyms = True +clean = True +lemmatize = True + +custom_words = ["grüßen", "fragen"] + +####################'####################'####################'####################'####################'############## + + +## files to textacy-corpus +textacyCorpus = textacy.Corpus(PARSER) + +print("add texts to textacy-corpus...") +textacyCorpus.add_texts(texts=generateTextfromTicketXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromTicketXML(DATAPATH)) + + +#for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize): +# textacyCorpus.add_text(txt,dic) + + + +for doc in textacyCorpus: + print(doc.metadata) + print(doc.text) + +#print(textacyCorpus[2].text) +#printRandomDoc(textacyCorpus) +#print(textacyCorpus[len(textacyCorpus)-1].text) + + +print() +print() + + + + + + + + + + + + + + + + + + + + + diff --git a/test.py b/old/test.py similarity index 99% rename from test.py rename to old/test.py index 9560698..fc2ee00 100644 --- a/test.py +++ b/old/test.py @@ -118,7 +118,7 @@ def keepinDoc(doc, toKeep=None): return " ".join([tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep]) -#todo https://mathieularose.com/function-composition-in-python/ +# https://mathieularose.com/function-composition-in-python/ parser = spacy.load('de') cleaner = TextCleaner(parser) 
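# --- hedged sketch, not part of this patch: the function composition referenced in the
# --- link above can be built with functools.reduce (the later preprocessing.py does exactly
# --- this); the str.lower / str.strip pipeline below is only an illustration.
import functools

def compose(*functions):
    # compose(f, g, h)(x) == f(g(h(x))) -- the rightmost function runs first
    def compose2(f, g):
        return lambda x: f(g(x))
    return functools.reduce(compose2, functions, lambda x: x)

pipeline = compose(str.strip, str.lower)
assert pipeline("  Frau Hinrichs  ") == "frau hinrichs"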
corpus_raw = textacy.Corpus(parser) diff --git a/textCleaning.py b/old/textCleaning.py similarity index 99% rename from textCleaning.py rename to old/textCleaning.py index dae6afb..da2fcd3 100644 --- a/textCleaning.py +++ b/old/textCleaning.py @@ -106,10 +106,6 @@ class TextCleaner: return " ".join(tokens) - def resolveAbbreviations(self,string): - return string #todo - - def keepPOSandENT(self, string, customPOS=None, customEnt=None, remove=None): pos2keep = self.pos2keep + (customPOS if customPOS is not None else []) @@ -142,6 +138,10 @@ class TextCleaner: + + + def resolveAbbreviations(self,string): + return string #todo def removeWords(self,string, custom_words=None, keep=None, lemmatize=False): wordlist = self.stop_words + (custom_words if custom_words is not None else []) @@ -176,11 +176,6 @@ class TextCleaner: return " ".join(set(tokens)) - - - - - def normalizeSynonyms(self, string, default_return_first_Syn=False): # parse with spaCy spacy_doc = self.parser(string) @@ -190,8 +185,6 @@ class TextCleaner: return " ".join(set(tokens)) - - def getFirstSynonym(self,word, thesaurus, default_return_first_Syn=False): if not isinstance(word, str): return word diff --git a/openthesaurus.csv b/openthesaurus.csv index caad708..e0c28df 100644 --- a/openthesaurus.csv +++ b/openthesaurus.csv @@ -1,5 +1,5 @@ -TH;Technische_Universität (Hauptform);Technische Hochschule;TU Passwort (Hauptform);Kodewort;Schlüsselwort;Zugangscode;Kennwort (Hauptform);Geheimcode;Losung;Codewort;Zugangswort;Losungswort;Parole +TH;Technische_Universität (Hauptform);Technische Hochschule;TU Fission;Kernfission;Kernspaltung;Atomspaltung Wiederaufnahme;Fortführung davonfahren;abdüsen (ugs.);aufbrechen;abfliegen;abfahren;(von etwas) fortfahren;abreisen;wegfahren;wegfliegen diff --git a/preprocessing.py b/preprocessing.py index 89b6317..70cb127 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -1,389 +1,190 @@ # -*- coding: utf-8 -*- import csv -import random +import functools import re - import spacy -import textacy import sys - +import textacy import xml.etree.ElementTree as ET -""" -import keras -import numpy as np -from keras.layers import Dense, SimpleRNN, LSTM, TimeDistributed, Dropout -from keras.models import Sequential -import keras.backend as K -""" +import io csv.field_size_limit(sys.maxsize) -""" -def getFirstSynonym(word, thesaurus_gen): - - word = word.lower() - # TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python - # durch den thesaurrus iterieren - for syn_block in thesaurus_gen: # syn_block ist eine liste mit Synonymen - - # durch den synonymblock iterieren - for syn in syn_block: - syn = syn.lower().split(" ") if not re.match(r'\A[\w-]+\Z', syn) else syn # aus synonym mach liste (um evtl. 
sätze zu identifieziren) - - # falls das wort in dem synonym enthalten ist (also == einem Wort in der liste ist) - if word in syn: - - # Hauptform suchen - if "auptform" in syn: - # nicht ausgeben, falls es in Klammern steht - for w in syn: - if not re.match(r'\([^)]+\)', w) and w is not None: - return w - - # falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht - if len(syn) == 1: - w = syn[0] - if not re.match(r'\([^)]+\)', w) and w is not None: - return w - - return word # zur Not die eingabe ausgeben +# Load the configuration file +import configparser as ConfigParser +config = ConfigParser.ConfigParser() +with open("config.ini") as f: + config.read_file(f) -""" -""" -def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False, normalize_synonyms=False): +PARSER = spacy.load(config.get("default","language")) +corpus = textacy.Corpus(PARSER) - # use preprocessing - if customPreprocessing is not None: - string = customPreprocessing(string) +thesauruspath = config.get("default","thesauruspath") +THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";")) - if custom_stopwords is not None: - custom_stopwords = custom_stopwords - else: - custom_stopwords = [] - - if custom_words is not None: - custom_words = custom_words - else: - custom_words = [] - - if custom_symbols is not None: - custom_symbols = custom_symbols - else: - custom_symbols = [] +def compose(*functions): + def compose2(f, g): + return lambda x: f(g(x)) + return functools.reduce(compose2, functions, lambda x: x) - # custom stoplist - # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import - stop_words = __import__("spacy." 
+ PARSER.lang, globals(), locals(), ['object']).STOP_WORDS +################ generate Content and Metadata ######################## - stoplist =list(stop_words) + custom_stopwords - # List of symbols we don't care about either - symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols +def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung'): + """ + generates strings from XML + :param path2xml: + :param main_textfield: + :param cleaning_function: + :yields strings + """ + tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) + root = tree.getroot() + for ticket in root: + for field in ticket: + if field.tag == main_textfield: + yield field.text - - # get rid of newlines - string = string.strip().replace("\n", " ").replace("\r", " ") - - # replace twitter - mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) - string = mentionFinder.sub("MENTION", string) - - # replace emails - emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE) - string = emailFinder.sub("EMAIL", string) - - # replace urls - urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE) - string = urlFinder.sub("URL", string) - - # replace HTML symbols - string = string.replace("&", "and").replace(">", ">").replace("<", "<") - - - - - # parse with spaCy - spacy_doc = PARSER(string) - tokens = [] - - added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"] - added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #IDEE NUM mit in den Corpus aufnehmen, aber fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 - - # append Tokens to a list - for tok in spacy_doc: - if tok.pos_ in added_POS: - if lemmatize: - tokens.append(tok.lemma_.lower().strip()) - else: - tokens.append(tok.text.lower().strip()) - - # add entities - if tok.ent_type_ in added_entities: - tokens.append(tok.text.lower()) - - - - # remove stopwords - tokens = [tok for tok in tokens if tok not in stoplist] - - # remove symbols - tokens = [tok for tok in tokens if tok not in symbols] - - # remove custom_words - tokens = [tok for tok in tokens if tok not in custom_words] - - # remove single characters - tokens = [tok for tok in tokens if len(tok)>1] - - # remove large strings of whitespace - remove_large_strings_of_whitespace(" ".join(tokens)) - - - #idee abkürzungen auflösen (v.a. TU -> Technische Universität) - - if normalize_synonyms: - tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens] - - return " ".join(tokens) - - -def remove_large_strings_of_whitespace(sentence): - - whitespaceFinder = re.compile(r'(\r\n|\r|\n)', re.IGNORECASE) - sentence = whitespaceFinder.sub(" ", sentence) - - tokenlist = sentence.split(" ") - - while "" in tokenlist: - tokenlist.remove("") - while " " in tokenlist: - tokenlist.remove(" ") - - return " ".join(tokenlist) -""" -""" -def generateFromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False,lemmatize=False): - import xml.etree.ElementTree as ET - +def generateMetadatafromTicketXML(path2xml, leave_out=['Beschreibung']): tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) root = tree.getroot() for ticket in root: metadata = {} - text = "ERROR" for field in ticket: - if field.tag == textfield: - if clean: - text = cleanText_words(field.text,PARSER,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize) - else: - text = field.text - else: - #idee hier auch cleanen? 
- metadata[field.tag] = field.text - yield text, metadata -""" + if field.tag not in leave_out: - -LANGUAGE = 'de' -#PARSER = de_core_news_md.load() -PARSER = spacy.load(LANGUAGE) - -from textCleaning import TextCleaner - -cleaner = TextCleaner(parser=PARSER) - - -def generateTextfromTicketXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False, lemmatize=False): - import xml.etree.ElementTree as ET - - tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) - root = tree.getroot() - - - for ticket in root: - text = "ERROR" - for field in ticket: - if field.tag == textfield: - if clean: - text = cleaner.normalizeSynonyms(cleaner.removeWords(cleaner.keepPOSandENT(field.text))) #,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize) - else: - text = field.text - yield text - -def generateMetadatafromTicketXML(path2xml, textfield='Beschreibung'):#,keys_to_clean=["Loesung","Zusammenfassung"]): - import xml.etree.ElementTree as ET - - tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) - - root = tree.getroot() - - for ticket in root: - metadata = {} - for field in ticket: - if field.tag != textfield: - if field.tag == "Zusammenfassung": - metadata[field.tag] = cleaner.removePunctuation(field.text) - elif field.tag == "Loesung": - metadata[field.tag] = cleaner.removeWhitespace(field.text) - else: metadata[field.tag] = field.text yield metadata +def printRandomDoc(textacyCorpus): + import random + print() + + print("len(textacyCorpus) = %i" % len(textacyCorpus)) + randIndex = int((len(textacyCorpus) - 1) * random.random()) + print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata)) + + print() + +################ Preprocess######################### + +def processDictstream(dictstream, funcdict, parser=PARSER): + for dic in dictstream: + result = {} + for key, value in dic.items(): + if key in funcdict: + result[key] = funcdict[key](parser(value)) + else: + result[key] = key + yield result + +def processTextstream(textstream, func, parser=PARSER): + # input str-stream output str-stream + pipe = parser.pipe(textstream) + + for doc in pipe: + yield func(doc) -""" -def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None): - if custom_symbols is not None: - custom_symbols = custom_symbols - else: - custom_symbols = [] +def keepOnlyPOS(pos_list, parser=PARSER): + return lambda doc : parser(" ".join([tok.text for tok in doc if tok.pos_ in pos_list])) - if keep is not None: - keep = keep - else: - keep = [] +def removeAllPOS(pos_list, parser=PARSER): + return lambda doc: parser(" ".join([tok.text for tok in doc if tok.pos_ not in pos_list])) - # List of symbols we don't care about - symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols +def keepOnlyENT(ent_list,parser=PARSER): + return lambda doc: parser(" ".join([tok.text for tok in doc if tok.ent_type_ in ent_list])) - # parse with spaCy - spacy_doc = parser(string) - tokens = [] - - pos = ["NUM", "SPACE", "PUNCT"] - for p in keep: - pos.remove(p) - - - # append Tokens to a list - for tok in spacy_doc: - if tok.pos_ not in pos and tok.text not in symbols: - tokens.append(tok.text) - - return " ".join(tokens) - -def cleanText_words(string,parser=PARSER, custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False): - - # use preprocessing - if customPreprocessing 
is not None: - string = customPreprocessing(string) - - if custom_stopwords is not None: - custom_stopwords = custom_stopwords - else: - custom_stopwords = [] - - if custom_words is not None: - custom_words = custom_words - else: - custom_words = [] - - - # custom stoplist - # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import - stop_words = __import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS - - stoplist =list(stop_words) + custom_stopwords - - # replace twitter - mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) - string = mentionFinder.sub("MENTION", string) - - # replace emails - emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE) - string = emailFinder.sub("EMAIL", string) - - # replace urls - urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE) - string = urlFinder.sub("URL", string) - - # replace HTML symbols - string = string.replace("&", "and").replace(">", ">").replace("<", "<") +def removeAllENT(ent_list, parser=PARSER): + return lambda doc: parser(" ".join([tok.text for tok in doc if tok.ent_type_ not in ent_list])) - # parse with spaCy - spacy_doc = parser(string) - tokens = [] - added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"] - added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 +doc2Set = lambda doc: str(set([tok.text for tok in doc])) +doc2String = lambda doc : doc.text - # append Tokens to a list - for tok in spacy_doc: - if tok.pos_ in added_POS: - if lemmatize: - tokens.append(tok.lemma_.lower().strip()) - else: - tokens.append(tok.text.lower().strip()) - # add entities - if tok.ent_type_ in added_entities: - tokens.append(tok.text.lower()) +mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) +emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE) +urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE) + +def replaceURLs(replace_with="URL",parser=PARSER): + #return lambda doc: parser(textacy.preprocess.replace_urls(doc.text,replace_with=replace_with)) + return lambda doc: parser(urlFinder.sub(replace_with,doc.text)) + +def replaceEmails(replace_with="EMAIL",parser=PARSER): + #return lambda doc: parser(textacy.preprocess.replace_emails(doc.text,replace_with=replace_with)) + return lambda doc : parser(emailFinder.sub(replace_with, doc.text)) + +def replaceTwitterMentions(replace_with="TWITTER_MENTION",parser=PARSER): + return lambda doc : parser(mentionFinder.sub(replace_with, doc.text)) + +def replaceNumbers(replace_with="NUMBER",parser=PARSER): + return lambda doc: parser(textacy.preprocess.replace_numbers(doc.text, replace_with=replace_with)) + +def replacePhonenumbers(replace_with="PHONE",parser=PARSER): + return lambda doc: parser(textacy.preprocess.replace_phone_numbers(doc.text, replace_with=replace_with)) - # remove stopwords - tokens = [tok for tok in tokens if tok not in stoplist] - - # remove custom_words - tokens = [tok for tok in tokens if tok not in custom_words] - - # remove single characters - tokens = [tok for tok in tokens if len(tok)>1] - - # remove large strings of whitespace - #remove_whitespace(" ".join(tokens)) - #idee abkürzungen auflösen (v.a. 
TU -> Technische Universität): abkürzungsverezeichnis - if normalize_synonyms: - tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens] - - return " ".join(set(tokens)) - -def cleanText_removeWhitespace(sentence): - whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE) - sentence = whitespaceFinder.sub(" ", sentence) - return sentence - -#todo: preprocess pipe: removewhitespace, removePUNCT, resolveAbk, keepPOS, keepEnt, removeWords, normalizeSynonyms +def resolveAbbreviations(parser=PARSER): + pass #todo -def getFirstSynonym(word, thesaurus_gen): +def removeWords(words, keep=None,parser=PARSER): + if hasattr(keep, '__iter__'): + for k in keep: + try: + words.remove(k) + except ValueError: + pass + return lambda doc : parser(" ".join([tok.text for tok in doc if tok.lower_ not in words])) + + + + +def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER): + #return lambda doc : parser(" ".join([tok.lower_ for tok in doc])) + return lambda doc : parser(" ".join([getFirstSynonym(tok.lower_, THESAURUS, default_return_first_Syn=default_return_first_Syn) for tok in doc])) + +def getFirstSynonym(word, thesaurus, default_return_first_Syn=False): + if not isinstance(word, str): + return str(word) word = word.lower() - # durch den thesaurrus iterieren - for syn_block in thesaurus_gen: # syn_block ist eine liste mit Synonymen + for syn_block in thesaurus: # syn_block ist eine liste mit Synonymen for syn in syn_block: syn = syn.lower() if re.match(r'\A[\w-]+\Z', syn): # falls syn einzelwort ist if word == syn: - return getHauptform(syn_block, word) + return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn)) else: # falls es ein satz ist if word in syn: - return getHauptform(syn_block, word) - return word # zur Not, das ursrpüngliche Wort zurückgeben + return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn)) + return str(word) # zur Not, das ursrpüngliche Wort zurückgeben def getHauptform(syn_block, word, default_return_first_Syn=False): - for syn in syn_block: syn = syn.lower() if "hauptform" in syn and len(syn.split(" ")) <= 2: - # nicht ausgeben, falls es in Klammern steht + # nicht ausgeben, falls es in Klammern steht#todo gibts macnmal?? klammern aus for w in syn.split(" "): if not re.match(r'\([^)]+\)', w): return w @@ -394,58 +195,58 @@ def getHauptform(syn_block, word, default_return_first_Syn=False): if not re.match(r'\([^)]+\)', w): return w return word # zur Not, das ursrpüngliche Wort zurückgeben -""" - -def printRandomDoc(textacyCorpus): - print() - - print("len(textacyCorpus) = %i" % len(textacyCorpus)) - randIndex = int((len(textacyCorpus) - 1) * random.random()) - print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata)) - - print() - -####################'####################'####################'####################'####################'############## -# todo config-file - -import de_core_news_md -DATAPATH = "ticketSamples.xml" -DATAPATH_thesaurus = "openthesaurus.csv" -normalize_Synonyms = True -clean = True -lemmatize = True +stop_words=list(__import__("spacy." 
+ PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) + config.get("preprocessing","custom_words").split(",") + +path2xml = config.get("default","path2xml") + + + +content_generator = generateMainTextfromTicketXML(path2xml) +metadata_generator = generateMetadatafromTicketXML(path2xml) + + + +ents = config.get("preprocessing","ents").split(",") + + + +clean_in_content=compose( + + doc2String, + #normalizeSynonyms(default_return_first_Syn=config.get("preprocessing","default_return_first_Syn")), + replaceEmails(), + replaceURLs(), + replaceTwitterMentions(), + removeWords(stop_words), + #removeAllPOS(["SPACE","PUNCT"]), + #removeAllENT(ents), + keepOnlyPOS(['NOUN']) +) + +clean_in_meta = { + "Loesung":removeAllPOS(["SPACE"]), + "Zusammenfassung":removeAllPOS(["SPACE","PUNCT"]) +} + + +contentStream = processTextstream(content_generator, func=clean_in_content) +metaStream = processDictstream(metadata_generator, funcdict=clean_in_meta) + + +corpus.add_texts(contentStream,metaStream) +print(corpus[0].text) +printRandomDoc(corpus) + + -custom_words = ["grüßen", "fragen"] - -####################'####################'####################'####################'####################'############## - - -## files to textacy-corpus -textacyCorpus = textacy.Corpus(PARSER) - -print("add texts to textacy-corpus...") -textacyCorpus.add_texts(texts=generateTextfromTicketXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromTicketXML(DATAPATH)) - - -#for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize): -# textacyCorpus.add_text(txt,dic) -for doc in textacyCorpus: - print(doc.metadata) - print(doc.text) - -#print(textacyCorpus[2].text) -#printRandomDoc(textacyCorpus) -#print(textacyCorpus[len(textacyCorpus)-1].text) -print() -print() From 991353b1bbac0428adf57b05c22704aee1a64688 Mon Sep 17 00:00:00 2001 From: "jannis.grundmann" Date: Mon, 11 Sep 2017 13:00:03 +0200 Subject: [PATCH 7/9] =?UTF-8?q?unsupervised=20und=20supervised-topic-train?= =?UTF-8?q?ing=20eingebaut.=20sollte=20man=20jez=20auf=20den=20datensatz?= =?UTF-8?q?=20loslassen=20k=C3=B6nnen?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- preprocessing.py | 177 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 148 insertions(+), 29 deletions(-) diff --git a/preprocessing.py b/preprocessing.py index 70cb127..61c1709 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -1,12 +1,17 @@ # -*- coding: utf-8 -*- import csv import functools +import os.path import re -import spacy -import sys -import textacy +import subprocess +import time import xml.etree.ElementTree as ET -import io + +import spacy +import textacy +from scipy import * +from textacy import Vectorizer + csv.field_size_limit(sys.maxsize) @@ -18,12 +23,16 @@ with open("config.ini") as f: config.read_file(f) + +path2xml = config.get("default","path2xml") + PARSER = spacy.load(config.get("default","language")) corpus = textacy.Corpus(PARSER) thesauruspath = config.get("default","thesauruspath") THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";")) +stop_words=list(__import__("spacy." 
+ PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) + config.get("preprocessing","custom_words").split(",") def compose(*functions): @@ -32,7 +41,6 @@ def compose(*functions): return functools.reduce(compose2, functions, lambda x: x) -################ generate Content and Metadata ######################## def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung'): """ @@ -73,8 +81,6 @@ def printRandomDoc(textacyCorpus): print() -################ Preprocess######################### - def processDictstream(dictstream, funcdict, parser=PARSER): for dic in dictstream: result = {} @@ -82,7 +88,7 @@ def processDictstream(dictstream, funcdict, parser=PARSER): if key in funcdict: result[key] = funcdict[key](parser(value)) else: - result[key] = key + result[key] = value yield result def processTextstream(textstream, func, parser=PARSER): @@ -109,7 +115,6 @@ def removeAllENT(ent_list, parser=PARSER): - doc2Set = lambda doc: str(set([tok.text for tok in doc])) doc2String = lambda doc : doc.text @@ -137,13 +142,9 @@ def replacePhonenumbers(replace_with="PHONE",parser=PARSER): - - - def resolveAbbreviations(parser=PARSER): pass #todo - def removeWords(words, keep=None,parser=PARSER): if hasattr(keep, '__iter__'): for k in keep: @@ -155,7 +156,6 @@ def removeWords(words, keep=None,parser=PARSER): - def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER): #return lambda doc : parser(" ".join([tok.lower_ for tok in doc])) return lambda doc : parser(" ".join([getFirstSynonym(tok.lower_, THESAURUS, default_return_first_Syn=default_return_first_Syn) for tok in doc])) @@ -196,23 +196,27 @@ def getHauptform(syn_block, word, default_return_first_Syn=False): return w return word # zur Not, das ursrpüngliche Wort zurückgeben +def label2ID(label): + return { + 'Neuanschluss' : 0, + 'LSF' : 1, + 'Video' : 2, + }.get(label,3) - -stop_words=list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) + config.get("preprocessing","custom_words").split(",") - -path2xml = config.get("default","path2xml") +def generate_labled_lines(textacyCorpus): + for doc in textacyCorpus: + # generate [topic1, topic2....] 
tok1 tok2 tok3 out of corpus + yield "[" + str(label2ID(doc.metadata["Kategorie"])) + "] " + doc.text -content_generator = generateMainTextfromTicketXML(path2xml) -metadata_generator = generateMetadatafromTicketXML(path2xml) + +####################'####################'####################'####################'####################'############## ents = config.get("preprocessing","ents").split(",") - - clean_in_content=compose( doc2String, @@ -232,19 +236,134 @@ clean_in_meta = { } -contentStream = processTextstream(content_generator, func=clean_in_content) -metaStream = processDictstream(metadata_generator, funcdict=clean_in_meta) -corpus.add_texts(contentStream,metaStream) -print(corpus[0].text) + + + +## add files to textacy-corpus, +print("add texts to textacy-corpus...") +corpus.add_texts( + processTextstream(generateMainTextfromTicketXML(path2xml), func=clean_in_content), + processDictstream(generateMetadatafromTicketXML(path2xml), funcdict=clean_in_meta) +) + printRandomDoc(corpus) - - - +####################'####################' Variablen todo alles in config + +ngrams = (1,2) + +min_df = 0 +max_df = 1.0 +no_below = 20 +no_above = 0.5 + +topicModel = 'lda' +# http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix +weighting = ('tf' if topicModel == 'lda' else 'tfidf') + +top_topic_words = 5 +top_document_labels_per_topic = 2 + +n_topics = 4 + + + + + + +####################'#################### + + + + +print("vectorize corpus...") +vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df) + +terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in corpus) +doc_term_matrix = vectorizer.fit_transform(terms_list) +id2term = vectorizer.__getattribute__("id_to_term") + + + + + + + + + + + +##################### LSA, LDA, NMF Topic Modeling via Textacy ############################################## + +# Initialize and train a topic model +print("Initialize and train a topic model") +model = textacy.tm.TopicModel(topicModel, n_topics=n_topics) +model.fit(doc_term_matrix) + +#Transform the corpus and interpret our model: +print("Transform the corpus and interpret our model") +doc_topic_matrix = model.transform(doc_term_matrix) +print() + + +for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words): + print('topic', topic_idx, ':', ' '.join(top_terms)) + + +print() +for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic): + print(topic_idx) + for j in top_docs: + print(corpus[j].metadata['Kategorie']) + +##################################################################################################################### +print() +print() + + + + + +##################### LLDA Topic Modeling via JGibbsLabledLDA ############################################## + + + +jgibbsLLDA_root = "java_LabledLDA/" +filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root) + + +#create file +textacy.fileio.write_file_lines(generate_labled_lines(corpus),filepath=filepath) + + +# wait for file to exist +while not os.path.exists(filepath): + time.sleep(1) + +print("start LLDA..") +#run JGibsslda file +FNULL = open(os.devnull, 'w') # supress output +subprocess.call(["java", + "-cp", "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(jgibbsLLDA_root), + "jgibblda.LDA", + "-est", + "-dir", "{0}models/tickets".format(jgibbsLLDA_root), + "-dfile","tickets.gz", + "-ntopics", 
str(n_topics)], stdout = FNULL) + +# ANMERKUNG: Dateien sind versteckt. zu finden in models/ + +#print twords +subprocess.call(["gzip", + "-dc", + "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]) +##################################################################################################################### +print() +print() From 67e6f8845c519428f340044c3814a84b15cf31de Mon Sep 17 00:00:00 2001 From: "jannis.grundmann" Date: Mon, 11 Sep 2017 13:03:20 +0200 Subject: [PATCH 8/9] fehlende ordner geadded --- .gitignore | 2 + java_LabledLDA/.idea/libraries/lib.xml | 10 + java_LabledLDA/.idea/misc.xml | 6 + java_LabledLDA/.idea/modules.xml | 8 + java_LabledLDA/.idea/workspace.xml | 439 ++++++++++++ java_LabledLDA/LICENSE | 339 +++++++++ java_LabledLDA/LabledLDA.iml | 13 + java_LabledLDA/README.md | 109 +++ java_LabledLDA/models/tickets/.others.gz | Bin 0 -> 76 bytes java_LabledLDA/models/tickets/.tassign.gz | Bin 0 -> 254 bytes java_LabledLDA/models/tickets/.theta.gz | Bin 0 -> 137 bytes java_LabledLDA/models/tickets/.twords.gz | Bin 0 -> 2277 bytes java_LabledLDA/models/tickets/.wordmap.gz | Bin 0 -> 697 bytes java_LabledLDA/models/tickets/tickets.gz | Bin 0 -> 780 bytes .../LabledLDA/jgibblda/Dictionary.class | Bin 0 -> 3450 bytes .../LabledLDA/jgibblda/Document.class | Bin 0 -> 1247 bytes .../LabledLDA/jgibblda/Estimator.class | Bin 0 -> 3383 bytes .../LabledLDA/jgibblda/Inferencer.class | Bin 0 -> 4418 bytes .../production/LabledLDA/jgibblda/LDA.class | Bin 0 -> 2129 bytes .../LabledLDA/jgibblda/LDACmdOption.class | Bin 0 -> 1969 bytes .../LabledLDA/jgibblda/LDADataset.class | Bin 0 -> 4778 bytes .../production/LabledLDA/jgibblda/Model.class | Bin 0 -> 14435 bytes .../production/LabledLDA/jgibblda/Pair.class | Bin 0 -> 1097 bytes java_LabledLDA/src/jgibblda/Dictionary.java | 150 ++++ java_LabledLDA/src/jgibblda/Document.java | 62 ++ java_LabledLDA/src/jgibblda/Estimator.java | 145 ++++ java_LabledLDA/src/jgibblda/Inferencer.java | 178 +++++ java_LabledLDA/src/jgibblda/LDA.java | 78 ++ java_LabledLDA/src/jgibblda/LDACmdOption.java | 51 ++ java_LabledLDA/src/jgibblda/LDADataset.java | 179 +++++ java_LabledLDA/src/jgibblda/Model.java | 669 ++++++++++++++++++ java_LabledLDA/src/jgibblda/Pair.java | 54 ++ old/testo.py | 199 ++++++ 33 files changed, 2691 insertions(+) create mode 100644 java_LabledLDA/.idea/libraries/lib.xml create mode 100644 java_LabledLDA/.idea/misc.xml create mode 100644 java_LabledLDA/.idea/modules.xml create mode 100644 java_LabledLDA/.idea/workspace.xml create mode 100644 java_LabledLDA/LICENSE create mode 100644 java_LabledLDA/LabledLDA.iml create mode 100644 java_LabledLDA/README.md create mode 100644 java_LabledLDA/models/tickets/.others.gz create mode 100644 java_LabledLDA/models/tickets/.tassign.gz create mode 100644 java_LabledLDA/models/tickets/.theta.gz create mode 100644 java_LabledLDA/models/tickets/.twords.gz create mode 100644 java_LabledLDA/models/tickets/.wordmap.gz create mode 100644 java_LabledLDA/models/tickets/tickets.gz create mode 100644 java_LabledLDA/out/production/LabledLDA/jgibblda/Dictionary.class create mode 100644 java_LabledLDA/out/production/LabledLDA/jgibblda/Document.class create mode 100644 java_LabledLDA/out/production/LabledLDA/jgibblda/Estimator.class create mode 100644 java_LabledLDA/out/production/LabledLDA/jgibblda/Inferencer.class create mode 100644 java_LabledLDA/out/production/LabledLDA/jgibblda/LDA.class create mode 100644 java_LabledLDA/out/production/LabledLDA/jgibblda/LDACmdOption.class create mode 100644 
java_LabledLDA/out/production/LabledLDA/jgibblda/LDADataset.class
 create mode 100644 java_LabledLDA/out/production/LabledLDA/jgibblda/Model.class
 create mode 100644 java_LabledLDA/out/production/LabledLDA/jgibblda/Pair.class
 create mode 100644 java_LabledLDA/src/jgibblda/Dictionary.java
 create mode 100644 java_LabledLDA/src/jgibblda/Document.java
 create mode 100644 java_LabledLDA/src/jgibblda/Estimator.java
 create mode 100644 java_LabledLDA/src/jgibblda/Inferencer.java
 create mode 100644 java_LabledLDA/src/jgibblda/LDA.java
 create mode 100644 java_LabledLDA/src/jgibblda/LDACmdOption.java
 create mode 100644 java_LabledLDA/src/jgibblda/LDADataset.java
 create mode 100644 java_LabledLDA/src/jgibblda/Model.java
 create mode 100644 java_LabledLDA/src/jgibblda/Pair.java
 create mode 100644 old/testo.py

diff --git a/.gitignore b/.gitignore
index d2a74a9..5fdb4fb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,8 @@ wheels/
 *.egg-info/
 .installed.cfg
 *.egg
+openthesaurus.csv
+
 # PyInstaller
 # Usually these files are written by a python script from a template
diff --git a/java_LabledLDA/.idea/libraries/lib.xml b/java_LabledLDA/.idea/libraries/lib.xml
new file mode 100644
index 0000000..fa8838a
--- /dev/null
+++ b/java_LabledLDA/.idea/libraries/lib.xml
@@ -0,0 +1,10 @@
[10 added lines of XML markup were stripped during extraction and are not recoverable]
\ No newline at end of file
diff --git a/java_LabledLDA/.idea/misc.xml b/java_LabledLDA/.idea/misc.xml
new file mode 100644
index 0000000..5182650
--- /dev/null
+++ b/java_LabledLDA/.idea/misc.xml
@@ -0,0 +1,6 @@
[6 added lines of XML markup were stripped during extraction and are not recoverable]
\ No newline at end of file
diff --git a/java_LabledLDA/.idea/modules.xml b/java_LabledLDA/.idea/modules.xml
new file mode 100644
index 0000000..3d6ae4f
--- /dev/null
+++ b/java_LabledLDA/.idea/modules.xml
@@ -0,0 +1,8 @@
[8 added lines of XML markup were stripped during extraction and are not recoverable]
\ No newline at end of file
diff --git a/java_LabledLDA/.idea/workspace.xml b/java_LabledLDA/.idea/workspace.xml
new file mode 100644
index 0000000..d2f2495
--- /dev/null
+++ b/java_LabledLDA/.idea/workspace.xml
@@ -0,0 +1,439 @@
[439 added lines of IDE workspace XML were stripped during extraction; only fragments such as "true DEFINITION_ORDER" survive]
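A rough illustration (not part of the patches) of the labeled input that PATCH 7 feeds to JGibbsLabeledLDA: the ticket texts and categories below are invented, while the "[<label-id>] tok1 tok2 ..." line format and the label2ID mapping are taken from preprocessing.py.

def label2ID(label):
    # same fallback as in preprocessing.py: unknown categories map to ID 3
    return {'Neuanschluss': 0, 'LSF': 1, 'Video': 2}.get(label, 3)

example_docs = [('Neuanschluss', 'anschluss dose buero'),
                ('LSF', 'passwort zugang lsf')]

for cat, text in example_docs:
    print('[{0}] {1}'.format(label2ID(cat), text))   # e.g. "[0] anschluss dose buero"

# PATCH 7 writes one such line per corpus document (gzip-compressed) to
# java_LabledLDA/models/tickets/tickets.gz via textacy.fileio.write_file_lines and then
# runs jgibblda.LDA with "-est -dfile tickets.gz -ntopics <n>".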