diff --git a/openthesaurus.csv b/openthesaurus.csv index a2348f7..caad708 100644 --- a/openthesaurus.csv +++ b/openthesaurus.csv @@ -1,3 +1,5 @@ +TH;Technische_Universität (Hauptform);Technische Hochschule;TU +Passwort (Hauptform);Kodewort;Schlüsselwort;Zugangscode;Kennwort (Hauptform);Geheimcode;Losung;Codewort;Zugangswort;Losungswort;Parole Fission;Kernfission;Kernspaltung;Atomspaltung Wiederaufnahme;Fortführung davonfahren;abdüsen (ugs.);aufbrechen;abfliegen;abfahren;(von etwas) fortfahren;abreisen;wegfahren;wegfliegen @@ -2182,7 +2184,6 @@ Spitzenklöppel (Handarbeit);Glockenklöppel;Klöppel gutartig;benigne (fachspr.) Beutelratte;Taschenratte rollen;kollern (ugs.);kullern;kugeln -Kodewort;Schlüsselwort;Zugangscode;Kennwort (Hauptform);Geheimcode;Losung;Codewort;Zugangswort;Passwort (Hauptform);Losungswort;Parole packen;einpacken Ratschluss;Urteil;Wille;Entscheidung;Entschlossenheit;Beschluss;das letzte Wort (ugs.);Entschluss;Entscheid (schweiz.) dreckig machen;versiffen;beschmutzen;verschmutzen @@ -4207,7 +4208,6 @@ Akzise;Oktroi;Verbrauchsabgabe Aufrührer;Tumultant genügsam;bedürfnislos zeigen;offenbaren;bekunden;kundtun -TH;Technische Universität;Technische Hochschule;TU Versprechen;Absichtserklärung (Nachrichtensprache);Zusicherung;Versicherung;Beteuerung Beschaulichkeit;Stille Auswärtiges Amt;Außenamt (ugs.);Außenministerium (ugs.);AA;Ministerium für Auswärtige Angelegenheiten diff --git a/preprocessing.py b/preprocessing.py index e9f5275..536f426 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -17,15 +17,7 @@ import keras.backend as K """ csv.field_size_limit(sys.maxsize) - -def printRandomDoc(textacyCorpus): - print() - print("len(textacyCorpus) = %i" % len(textacyCorpus)) - randIndex = int((len(textacyCorpus) - 1) * random.random()) - print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata)) - print() - - +""" def getFirstSynonym(word, thesaurus_gen): word = word.lower() @@ -58,8 +50,9 @@ def getFirstSynonym(word, thesaurus_gen): return word # zur Not die eingabe ausgeben -def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False): - import re +""" +""" +def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False, normalize_synonyms=False): # use preprocessing if customPreprocessing is not None: @@ -119,7 +112,7 @@ def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=No tokens = [] added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"] - added_POS = ["NOUN", "NUM" ]#,"VERB","ADJ"] #IDEE NUM mit in den Corpus aufnehmen, aber fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 + added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #IDEE NUM mit in den Corpus aufnehmen, aber fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 # append Tokens to a list for tok in spacy_doc: @@ -148,55 +141,33 @@ def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=No tokens = [tok for tok in tokens if len(tok)>1] # remove large strings of whitespace - while "" in tokens: - tokens.remove("") - while " " in tokens: - tokens.remove(" ") - while "\n" in tokens: - tokens.remove("\n") - while "\n\n" in tokens: - tokens.remove("\n\n") - """ - tokenz = [] - for tok in tokens: - tokenz.append(str(getFirstSynonym(tok,THESAURUS_gen))) - tokens = tokenz - """ - tokens = [str(getFirstSynonym(tok,THESAURUS_gen)) for tok 
in tokens] + remove_large_strings_of_whitespace(" ".join(tokens)) + + + #idee abkürzungen auflösen (v.a. TU -> Technische Universität) + + if normalize_synonyms: + tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens] return " ".join(tokens) -def generateTextfromXML(path2xml, clean=True, textfield='Beschreibung'): - import xml.etree.ElementTree as ET +def remove_large_strings_of_whitespace(sentence): - tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) + whitespaceFinder = re.compile(r'(\r\n|\r|\n)', re.IGNORECASE) + sentence = whitespaceFinder.sub(" ", sentence) - root = tree.getroot() + tokenlist = sentence.split(" ") - for subject in root.iter(textfield): - if clean: - yield cleanText(subject.text) - else: - yield subject.text + while "" in tokenlist: + tokenlist.remove("") + while " " in tokenlist: + tokenlist.remove(" ") -def generateMetadatafromXML(path2xml, keys=["Loesung","Kategorie","Zusammenfassung"]): - import xml.etree.ElementTree as ET - - tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) - - root = tree.getroot() - - metadata = dict.fromkeys(keys) - - - for ticket in root.findall('ticket'): - for key in metadata: - metadata[key] = ticket.find(key).text - - yield metadata - -def generateFromXML(path2xml, clean=True, textfield='Beschreibung'): + return " ".join(tokenlist) +""" +""" +def generateFromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False,lemmatize=False): import xml.etree.ElementTree as ET tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) @@ -208,58 +179,291 @@ def generateFromXML(path2xml, clean=True, textfield='Beschreibung'): for field in ticket: if field.tag == textfield: if clean: - text = cleanText(field.text) + text = cleanText_words(field.text,PARSER,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize) else: text = field.text else: + #idee hier auch cleanen? 
metadata[field.tag] = field.text yield text, metadata +""" +LANGUAGE = 'de' +PARSER = spacy.load(LANGUAGE) + + + +def generateTextfromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False,lemmatize=False): + import xml.etree.ElementTree as ET + + tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) + + root = tree.getroot() + """ + for subject in root.iter(textfield): + if clean: + yield cleanText(subject.text) + else: + yield subject.text + """ + for ticket in root: + text = "ERROR" + for field in ticket: + if field.tag == textfield: + if clean: + text = cleanText_words(field.text,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize) + else: + text = field.text + yield text + +def generateMetadatafromXML(path2xml, textfield='Beschreibung'):#,keys_to_clean=["Loesung","Zusammenfassung"]): + import xml.etree.ElementTree as ET + + tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) + + root = tree.getroot() + + """ + metadata = dict.fromkeys(keys) + + for ticket in root.findall('ticket'): + for key in metadata: + metadata[key] = ticket.find(key).text + + yield metadata + """ + for ticket in root: + metadata = {} + for field in ticket: + if field.tag != textfield: + if field.tag == "Zusammenfassung": + # idee lösung nur whitespace entfernen, zusammenfassung auch von symbolen befreien + metadata[field.tag] = cleanText_symbols(field.text) + elif field.tag == "Loesung": + metadata[field.tag] = remove_whitespace(field.text) + else: + metadata[field.tag] = field.text + + yield metadata + + +def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None): + """ + https://spacy.io/docs/usage/pos-tagging + + cleans text from PUNCT, NUM, whitespaces, newlines, and the following list of symbols: + + ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + + """ + if custom_symbols is not None: + custom_symbols = custom_symbols + else: + custom_symbols = [] + + if keep is not None: + keep = keep + else: + keep = [] + + # List of symbols we don't care about + symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols + + # parse with spaCy + spacy_doc = parser(string) + tokens = [] + + pos = ["NUM", "SPACE", "PUNCT"] + for p in keep: + pos.remove(p) + + + # append Tokens to a list + for tok in spacy_doc: + if tok.pos_ not in pos and tok.text not in symbols: + tokens.append(tok.text) + + return " ".join(tokens) + +def cleanText_words(string,parser=PARSER, custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False): + + # use preprocessing + if customPreprocessing is not None: + string = customPreprocessing(string) + + if custom_stopwords is not None: + custom_stopwords = custom_stopwords + else: + custom_stopwords = [] + + if custom_words is not None: + custom_words = custom_words + else: + custom_words = [] + + + # custom stoplist + # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import + stop_words = __import__("spacy." 
+ parser.lang, globals(), locals(), ['object']).STOP_WORDS + + stoplist =list(stop_words) + custom_stopwords + + # replace twitter + mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) + string = mentionFinder.sub("MENTION", string) + + # replace emails + emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE) + string = emailFinder.sub("EMAIL", string) + + # replace urls + urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE) + string = urlFinder.sub("URL", string) + + # replace HTML symbols + string = string.replace("&", "and").replace(">", ">").replace("<", "<") + + + + # parse with spaCy + spacy_doc = parser(string) + tokens = [] + + added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"] + added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 + + # append Tokens to a list + for tok in spacy_doc: + if tok.pos_ in added_POS: + if lemmatize: + tokens.append(tok.lemma_.lower().strip()) + else: + tokens.append(tok.text.lower().strip()) + + # add entities + if tok.ent_type_ in added_entities: + tokens.append(tok.text.lower()) + + + + # remove stopwords + tokens = [tok for tok in tokens if tok not in stoplist] + + # remove custom_words + tokens = [tok for tok in tokens if tok not in custom_words] + + # remove single characters + tokens = [tok for tok in tokens if len(tok)>1] + + # remove large strings of whitespace + #remove_whitespace(" ".join(tokens)) + + + #idee abkürzungen auflösen (v.a. TU -> Technische Universität) + + if normalize_synonyms: + tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens] + + return " ".join(set(tokens)) + +def remove_whitespace(sentence): + whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE) + sentence = whitespaceFinder.sub(" ", sentence) + return sentence + +def getFirstSynonym(word, thesaurus_gen): + + word = word.lower() + + + # durch den thesaurrus iterieren + for syn_block in thesaurus_gen: # syn_block ist eine liste mit Synonymen + + for syn in syn_block: + syn = syn.lower() + if re.match(r'\A[\w-]+\Z', syn): # falls syn einzelwort ist + if word == syn: + return getHauptform(syn_block, word) + else: # falls es ein satz ist + if word in syn: + return getHauptform(syn_block, word) + return word # zur Not, das ursrpüngliche Wort zurückgeben + +def getHauptform(syn_block, word, default_return_first_Syn=False): + + for syn in syn_block: + syn = syn.lower() + + if "hauptform" in syn and len(syn.split(" ")) <= 2: + # nicht ausgeben, falls es in Klammern steht + for w in syn.split(" "): + if not re.match(r'\([^)]+\)', w): + return w + + if default_return_first_Syn: + # falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht + for w in syn_block: + if not re.match(r'\([^)]+\)', w): + return w + return word # zur Not, das ursrpüngliche Wort zurückgeben + + +def printRandomDoc(textacyCorpus): + print() + + print("len(textacyCorpus) = %i" % len(textacyCorpus)) + randIndex = int((len(textacyCorpus) - 1) * random.random()) + print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata)) + + print() + ####################'####################'####################'####################'####################'############## - +import de_core_news_md DATAPATH = "ticketSamples.xml" DATAPATH_thesaurus = "openthesaurus.csv" -LANGUAGE = 'de' +normalize_Synonyms = True +clean = True 
+lemmatize = True + +custom_words = ["grüßen", "fragen"] + ####################'####################'####################'####################'####################'############## -PARSER = spacy.load(LANGUAGE) -THESAURUS_gen = textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";") # generator [[a,b,c,..],[a,b,c,..],...] +#PARSER = de_core_news_md.load() + +THESAURUS_list=list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";")) ## !!!!!! list wichtig, da sonst nicht die gleichen Synonyme zurückgegeben werden, weil der generator während der laufzeit pickt + + ## files to textacy-corpus textacyCorpus = textacy.Corpus(PARSER) print("add texts to textacy-corpus...") -#textacyCorpus.add_texts(texts=generateTextfromXML(DATAPATH), metadatas=generateMetadatafromXML(DATAPATH)) -for txt, dic in generateFromXML(DATAPATH): - textacyCorpus.add_text(txt,dic) +textacyCorpus.add_texts(texts=generateTextfromXML(DATAPATH,normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromXML(DATAPATH)) + + +#for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize): +# textacyCorpus.add_text(txt,dic) +for doc in textacyCorpus: + print(doc.metadata) + print(doc.text) - -print(textacyCorpus[2].text) +#print(textacyCorpus[2].text) #printRandomDoc(textacyCorpus) #print(textacyCorpus[len(textacyCorpus)-1].text) - - - - - - - - - - - - +print() +print() diff --git a/test.py b/test.py index ec4a3db..201e1c9 100644 --- a/test.py +++ b/test.py @@ -8,90 +8,146 @@ import xml.etree.ElementTree as ET DATAPATH_thesaurus = "openthesaurus.csv" -def generateFromXML(path2xml, clean=True, textfield='Beschreibung'): - import xml.etree.ElementTree as ET - tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")) - root = tree.getroot() +PARSER = spacy.load('de') - for ticket in root: - metadata = {} - text = "ERROR" - for field in ticket: - if field.tag == textfield: - if clean: - text = (field.text) + + +def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None): + """ + https://spacy.io/docs/usage/pos-tagging + + cleans text from PUNCT, NUM, whitespaces, newlines, and the following list of symbols: + + ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + + """ + if custom_symbols is not None: + custom_symbols = custom_symbols + else: + custom_symbols = [] + + if keep is not None: + keep = keep + else: + keep = [] + + # List of symbols we don't care about + symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols + + # parse with spaCy + spacy_doc = parser(string) + tokens = [] + + pos = ["NUM", "SPACE", "PUNCT"] + for p in keep: + pos.remove(p) + + + # append Tokens to a list + for tok in spacy_doc: + if tok.pos_ not in pos: + tokens.append(tok.text.lower().strip()) + + + # remove symbols + tokens = [tok for tok in tokens if tok not in symbols] + + # remove whitespace + remove_whitespace(" ".join(tokens)) + + return " ".join(tokens) + +def cleanText_words(string, parser=PARSER, custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False): + + # use preprocessing + if customPreprocessing is not None: + string = customPreprocessing(string) + + if custom_stopwords is not None: + custom_stopwords = custom_stopwords + else: + custom_stopwords = [] + + if custom_words is not 
None: + custom_words = custom_words + else: + custom_words = [] + + + # custom stoplist + # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import + stop_words = __import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS + + stoplist =list(stop_words) + custom_stopwords + + # replace twitter + mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) + string = mentionFinder.sub("MENTION", string) + + # replace emails + emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE) + string = emailFinder.sub("EMAIL", string) + + # replace urls + urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE) + string = urlFinder.sub("URL", string) + + # replace HTML symbols + string = string.replace("&", "and").replace(">", ">").replace("<", "<") + + + + # parse with spaCy + spacy_doc = parser(string) + tokens = [] + + added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"] + added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013 + + # append Tokens to a list + for tok in spacy_doc: + if tok.pos_ in added_POS: + if lemmatize: + tokens.append(tok.lemma_.lower().strip()) else: - text = field.text - else: - metadata[field.tag] = field.text - yield text, metadata + tokens.append(tok.text.lower().strip()) - -def getFirstSynonym(word, thesaurus_gen): - - word = word.lower() - # TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python - - - # durch den thesaurrus iterieren - for syn_block in thesaurus_gen: # syn_block ist eine liste mit Synonymen - - # durch den synonymblock iterieren - for syn in syn_block: - syn = syn.lower().split(" ") if not re.match(r'\A[\w-]+\Z', syn) else syn # aus synonym mach liste (um evtl. sätze zu identifieziren) - - # falls das wort in dem synonym enthalten ist (also == einem Wort in der liste ist) - if word in syn: - - # Hauptform suchen - if "Hauptform" in syn: - # nicht ausgeben, falls es in Klammern steht - for w in syn: - if not re.match(r'\([^)]+\)', w) and w is not None: - return w - - # falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht - if len(syn) == 1: - w = syn[0] - if not re.match(r'\([^)]+\)', w) and w is not None: - return w - - return word # zur Not die eingabe ausgeben + # add entities + if tok.ent_type_ in added_entities: + tokens.append(tok.text.lower()) -def getFirstSynonym(word, thesaurus_gen): + # remove stopwords + tokens = [tok for tok in tokens if tok not in stoplist] - word = word.lower() - # TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python + # remove custom_words + tokens = [tok for tok in tokens if tok not in custom_words] + + # remove single characters + tokens = [tok for tok in tokens if len(tok)>1] + + # remove large strings of whitespace + #remove_whitespace(" ".join(tokens)) - # durch den thesaurrus iterieren - for syn_block in thesaurus_gen: # syn_block ist eine liste mit Synonymen + #idee abkürzungen auflösen (v.a. 
TU -> Technische Universität) - for syn in syn_block: + #if normalize_synonyms: + # tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens] - if re.match(r'\A[\w-]+\Z', syn): #falls syn einzelwort ist - if word == syn: - getHauptform(syn_block) + return " ".join(tokens) +def remove_whitespace(sentence): + whitespaceFinder = re.compile(r'(\r\n|\r|\n|\s)', re.IGNORECASE) + sentence = whitespaceFinder.sub(" ", sentence) + return sentence - - -def getHauptform(syn_block): - for s in syn_block: - if "Hauptform" in s: - # nicht ausgeben, falls es in Klammern steht - for w in s: - if not re.match(r'\([^)]+\)', w) and w is not None: - return w - - # falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht - if len(s) == 1: - w = s[0] - if not re.match(r'\([^)]+\)', w) and w is not None: - return w +def cleanText_normalize(string, parser=PARSER, customPreprocessing=cleanText_words, lemmatize=True): + # use preprocessing + if customPreprocessing is not None: + string = customPreprocessing(string) @@ -101,25 +157,9 @@ def getHauptform(syn_block): +string = "Frau Hinrichs überdenkt die Situation und 545453 macht dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test" +print(cleanText_symbols(string=string, parser=PARSER, keep=["NUM"])) - - - - - - - - -strings = ["passwort",""] -THESAURUS_gen = textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";") # generator [[a,b,c,..],[a,b,c,..],...] - -for s in strings: - print(getFirstSynonym(s,THESAURUS_gen)) - - - - - - - +string = "Frau Hinrichs überdenkt die Situation und 545453 macht dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test" +print(cleanText_symbols(string=string, parser=PARSER, keep=None))
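
The synonym normalization in this commit hinges on two helpers, `getFirstSynonym` and `getHauptform`, plus the detail flagged in `preprocessing.py` that the thesaurus has to be materialized into a list (a generator would be consumed after the first token, which is why `THESAURUS_list` replaces `THESAURUS_gen`). Below is a minimal, self-contained sketch of that lookup, assuming the semicolon-separated `openthesaurus.csv` layout shown at the top of the diff. The helper names (`load_thesaurus`, `first_synonym`, `hauptform`) are illustrative, not the committed API, and the matching is simplified to exact single-word comparison.

```python
# Sketch only: simplified Hauptform lookup against an openthesaurus-style CSV.
# Helper names are illustrative; the committed code also handles multi-word
# synonyms and a default_return_first_Syn fallback.
import csv
import re

def load_thesaurus(path):
    # Materialize the rows into a list: a generator would be exhausted after
    # the first lookup (the reason for THESAURUS_list in the diff).
    with open(path, encoding="utf-8") as f:
        return [row for row in csv.reader(f, delimiter=";")]

def hauptform(syn_block, fallback):
    # Prefer the synonym marked "(Hauptform)", returning the word itself
    # rather than the "(Hauptform)" annotation.
    for syn in syn_block:
        syn = syn.lower()
        if "hauptform" in syn:
            for w in syn.split(" "):
                if not re.match(r"\([^)]+\)", w):
                    return w
    return fallback

def first_synonym(word, thesaurus):
    word = word.lower()
    for syn_block in thesaurus:  # each row is one block of synonyms
        if any(word == syn.lower() for syn in syn_block):
            return hauptform(syn_block, word)
    return word  # fall back to the input word

if __name__ == "__main__":
    # tiny in-memory thesaurus mirroring the two rows added in the CSV diff
    thesaurus = [
        ["Passwort (Hauptform)", "Kodewort", "Kennwort (Hauptform)", "Parole"],
        ["TH", "Technische_Universität (Hauptform)", "Technische Hochschule", "TU"],
    ]
    print(first_synonym("kodewort", thesaurus))  # -> "passwort"
    print(first_synonym("tu", thesaurus))        # -> "technische_universität"
```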
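
The corpus construction at the bottom of `preprocessing.py` relies on two generators walking the same XML file, so that `textacy`'s `add_texts(texts=..., metadatas=...)` can pair the i-th text with the i-th metadata dict. A condensed sketch of that pairing, reusing the file and field names from the diff (`ticketSamples.xml`, `Beschreibung`); the helper names here are illustrative:

```python
# Sketch of the paired-generator pattern used by generateTextfromXML /
# generateMetadatafromXML: both walk the tickets in document order, which is
# what makes the positional pairing in textacy's add_texts valid.
import xml.etree.ElementTree as ET

def iter_tickets(path2xml):
    root = ET.parse(path2xml, ET.XMLParser(encoding="utf-8")).getroot()
    for ticket in root:
        yield ticket

def texts(path2xml, textfield="Beschreibung"):
    for ticket in iter_tickets(path2xml):
        field = ticket.find(textfield)
        yield field.text if field is not None else ""

def metadatas(path2xml, textfield="Beschreibung"):
    for ticket in iter_tickets(path2xml):
        yield {f.tag: f.text for f in ticket if f.tag != textfield}

# corpus.add_texts(texts=texts("ticketSamples.xml"),
#                  metadatas=metadatas("ticketSamples.xml"))
# pairs text i with metadata dict i; this only holds because both generators
# traverse the tickets in the same order.
```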