openthesaurus refactored.

2017-08-31 10:38:29 +02:00 · 2017-08-31 10:38:29 +02:00 · 68d8115344
parent bb9edcff25
commit 68d8115344
3 changed files with 111 additions and 95 deletions
--- a/openthesaurus.csv
+++ b/openthesaurus.csv
@ -1,3 +1,4 @@
+Kodewort;Schlüsselwort;Zugangscode;Passwort (Hauptform);Kennwort (Hauptform);Geheimcode;Losung;Codewort;Zugangswort;Losungswort;Parole
 Fission;Kernfission;Kernspaltung;Atomspaltung
 Wiederaufnahme;Fortführung
 davonfahren;abdüsen (ugs.);aufbrechen;abfliegen;abfahren;(von etwas) fortfahren;abreisen;wegfahren;wegfliegen
@ -2182,7 +2183,6 @@ Spitzenklöppel (Handarbeit);Glockenklöppel;Klöppel
 gutartig;benigne (fachspr.)
 Beutelratte;Taschenratte
 rollen;kollern (ugs.);kullern;kugeln
-Kodewort;Schlüsselwort;Zugangscode;Kennwort (Hauptform);Geheimcode;Losung;Codewort;Zugangswort;Passwort (Hauptform);Losungswort;Parole
 packen;einpacken
 Ratschluss;Urteil;Wille;Entscheidung;Entschlossenheit;Beschluss;das letzte Wort (ugs.);Entschluss;Entscheid (schweiz.)
 dreckig machen;versiffen;beschmutzen;verschmutzen
--- a/preprocessing.py
+++ b/preprocessing.py
@ -20,12 +20,14 @@ csv.field_size_limit(sys.maxsize)

 def printRandomDoc(textacyCorpus):
    print()
+
    print("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
+
    print()

-
+"""
 def getFirstSynonym(word, thesaurus_gen):

        word = word.lower()
@ -56,10 +58,9 @@ def getFirstSynonym(word, thesaurus_gen):
                            return w

        return word # zur Not die eingabe ausgeben
+"""

-
-def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False):
-    import re
+def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False, normalize_synonyms=False):

    # use preprocessing
    if customPreprocessing is not None:
@ -156,17 +157,14 @@ def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=No
        tokens.remove("\n")
    while "\n\n" in tokens:
        tokens.remove("\n\n")
-    """
-    tokenz = []
-    for tok in tokens:
-        tokenz.append(str(getFirstSynonym(tok,THESAURUS_gen)))
-    tokens = tokenz
-    """
-    tokens = [str(getFirstSynonym(tok,THESAURUS_gen)) for tok in tokens]
+
+    #TODO abkürzungen auflösen (v.a. TU -> Technische Universität)
+
+    if normalize_synonyms:
+        tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens]

    return " ".join(tokens)

-
 def generateTextfromXML(path2xml, clean=True, textfield='Beschreibung'):
    import xml.etree.ElementTree as ET

@ -196,7 +194,7 @@ def generateMetadatafromXML(path2xml, keys=["Loesung","Kategorie","Zusammenfassu

        yield metadata

-def generateFromXML(path2xml, clean=True, textfield='Beschreibung'):
+def generateFromXML(path2xml,  textfield='Beschreibung', clean=False, normalize_Synonyms=False):
    import xml.etree.ElementTree as ET

    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
@ -208,17 +206,55 @@ def generateFromXML(path2xml, clean=True, textfield='Beschreibung'):
        for field in ticket:
            if field.tag == textfield:
                if clean:
-                    text = cleanText(field.text)
+                    text = cleanText(field.text,normalize_synonyms=normalize_Synonyms,lemmatize=False)
                else:
                    text = field.text
            else:
+                #todo hier auch cleanen?
                metadata[field.tag] = field.text
        yield text, metadata

+def getFirstSynonym(word, thesaurus_gen):
+    """Return the main form of the first synonym block containing `word`, or the lowercased word if none matches.
+
+    thesaurus_gen: iterable of synonym blocks (lists of strings); it is iterated once per call.
+    Matching is case-insensitive; multi-word entries match only on a whole token, never on a substring."""
+    word = word.lower()
+    for syn_block in thesaurus_gen:  # syn_block is a list of synonyms
+        for syn in syn_block:
+            syn = syn.lower()
+            if re.match(r'\A[\w-]+\Z', syn):  # syn is a single word
+                if word == syn:
+                    return getHauptform(syn_block, word)
+            else:  # syn is a phrase or carries an annotation such as "(ugs.)"
+                # compare whole tokens: a substring test would let "" or "ort" hit "passwort (hauptform)"
+                if word in syn.split():
+                    return getHauptform(syn_block, word)
+    return word  # fall back to the (lowercased) input word
+
+
+def getHauptform(syn_block, word, default_return_first_Syn=False):
+    """Return the lowercased word of the first entry marked "(Hauptform)" in syn_block (a list of strings), else `word`.
+
+    If no such entry exists and default_return_first_Syn is true, return the first single-word synonym, lowercased."""
+    for syn in syn_block:
+        syn = syn.lower()
+        if "hauptform" in syn and len(syn.split(" ")) <= 2:
+            # return the word itself, not the parenthesised "(hauptform)" tag
+            for w in syn.split(" "):
+                if not re.match(r'\([^)]+\)', w):
+                    return w
+    if default_return_first_Syn:
+        # no main form: take the first synonym that is one plain word (no phrase, no parenthesised part)
+        for w in syn_block:
+            if re.match(r'\A[\w-]+\Z', w):
+                return w.lower()
+    return word  # last resort: hand back the word that was passed in
+

 ####################'####################'####################'####################'####################'##############

-
+import de_core_news_md
 DATAPATH = "ticketSamples.xml"
 DATAPATH_thesaurus = "openthesaurus.csv"

@ -227,8 +263,11 @@ LANGUAGE = 'de'

 ####################'####################'####################'####################'####################'##############

-PARSER = spacy.load(LANGUAGE)
-THESAURUS_gen = textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";")  # generator [[a,b,c,..],[a,b,c,..],...]
+PARSER = de_core_news_md.load()#spacy.load(LANGUAGE)
+
+THESAURUS_list=list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))  ## !!!!!! list wichtig, da sonst nicht die gleichen Synonyme zurückgegeben werden, weil der generator während der laufzeit pickt
+
+


 ## files to textacy-corpus
@ -236,27 +275,40 @@ textacyCorpus = textacy.Corpus(PARSER)

 print("add texts to textacy-corpus...")
 #textacyCorpus.add_texts(texts=generateTextfromXML(DATAPATH), metadatas=generateMetadatafromXML(DATAPATH))
-for txt, dic in generateFromXML(DATAPATH):
+for txt, dic in generateFromXML(DATAPATH,normalize_Synonyms=True,clean=True):
    textacyCorpus.add_text(txt,dic)



+for doc in textacyCorpus:
+    print(doc.text)

-
-print(textacyCorpus[2].text)
+#print(textacyCorpus[2].text)
 #printRandomDoc(textacyCorpus)
 #print(textacyCorpus[len(textacyCorpus)-1].text)


-
-
-
-
-
-
-
-
-
+print()
+print()
+
+#################### 1
+
+PARSER = de_core_news_md.load()#spacy.load(LANGUAGE)
+
+## files to textacy-corpus
+textacyCorpus = textacy.Corpus(PARSER)
+
+for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=False, clean=True):
+    textacyCorpus.add_text(txt,dic)
+
+
+for doc in textacyCorpus:
+    print(doc.text)
+
+
+
+print()
+print()



--- a/test.py
+++ b/test.py
@ -28,95 +28,59 @@ def generateFromXML(path2xml, clean=True, textfield='Beschreibung'):
        yield text, metadata


-def getFirstSynonym(word, thesaurus_gen):
-
-        word = word.lower()
-        # TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python
-
-
-        # durch den thesaurrus iterieren
-        for syn_block in thesaurus_gen:  # syn_block ist eine liste mit Synonymen
-
-            # durch den synonymblock iterieren
-            for syn in syn_block:
-                syn = syn.lower().split(" ") if not re.match(r'\A[\w-]+\Z', syn) else syn # aus synonym mach liste (um evtl. sätze zu identifieziren)
-
-                # falls das wort in dem synonym enthalten ist (also == einem Wort in der liste ist)
-                if word in syn:
-
-                    # Hauptform suchen
-                    if "Hauptform" in syn:
-                        # nicht ausgeben, falls es in Klammern steht
-                        for w in syn:
-                            if not re.match(r'\([^)]+\)', w) and w is not None:
-                                return w
-
-                    # falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht
-                    if len(syn) == 1:
-                        w = syn[0]
-                        if not re.match(r'\([^)]+\)', w) and w is not None:
-                            return w
-
-        return word # zur Not die eingabe ausgeben
-
-

 def getFirstSynonym(word, thesaurus_gen):

        word = word.lower()
-        # TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python
-

        # durch den thesaurrus iterieren
        for syn_block in thesaurus_gen:  # syn_block ist eine liste mit Synonymen

            for syn in syn_block:
-
+                syn = syn.lower()
                if re.match(r'\A[\w-]+\Z', syn):    #falls syn einzelwort ist
                    if word == syn:
-                        getHauptform(syn_block)
+                        return getHauptform(syn_block,word)
+                else:   # falls es ein satz ist
+                    if word in syn:
+                        return getHauptform(syn_block,word)
+        return word #zur Not, das ursrpüngliche Wort zurückgeben




-def getHauptform(syn_block):
-    for s in syn_block:
-        if "Hauptform" in s:
+def getHauptform(syn_block,word,default_return_first_Syn=False):
+
+    for syn in syn_block:
+        syn = syn.lower()
+
+        if "hauptform" in syn:
            # nicht ausgeben, falls es in Klammern steht
-            for w in s:
-                if not re.match(r'\([^)]+\)', w) and w is not None:
+            for w in syn.split(" "):
+                if not re.match(r'\([^)]+\)', w):
                    return w

+    if default_return_first_Syn:
        # falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht
-    if len(s) == 1:
-        w = s[0]
-        if not re.match(r'\([^)]+\)', w) and w is not None:
+        for w in syn_block:
+            if not re.match(r'\([^)]+\)', w):
                return w
+    return word  # zur Not, das ursrpüngliche Wort zurückgeben

+THESAURUS_gen = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))  # generator [[a,b,c,..],[a,b,c,..],...]

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-strings = ["passwort",""]
-THESAURUS_gen = textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";")  # generator [[a,b,c,..],[a,b,c,..],...]
+strings = ["anmachen","Kernspaltung"]
+#strings = ["Kernspaltung","Kennwort"]

 for s in strings:
    print(getFirstSynonym(s,THESAURUS_gen))

+strings = ["Kennwort"]
+#THESAURUS_gen = textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";")  # generator [[a,b,c,..],[a,b,c,..],...]
+
+for s in strings:
+    print(getFirstSynonym(s, THESAURUS_gen))
+