diff --git a/openthesaurus.csv b/openthesaurus.csv
index a2348f7..ce336b3 100644
--- a/openthesaurus.csv
+++ b/openthesaurus.csv
@@ -1,3 +1,4 @@
+Kodewort;Schlüsselwort;Zugangscode;Passwort (Hauptform);Kennwort (Hauptform);Geheimcode;Losung;Codewort;Zugangswort;Losungswort;Parole
 Fission;Kernfission;Kernspaltung;Atomspaltung
 Wiederaufnahme;Fortführung
 davonfahren;abdüsen (ugs.);aufbrechen;abfliegen;abfahren;(von etwas) fortfahren;abreisen;wegfahren;wegfliegen
@@ -2182,7 +2183,6 @@ Spitzenklöppel (Handarbeit);Glockenklöppel;Klöppel
 gutartig;benigne (fachspr.)
 Beutelratte;Taschenratte
 rollen;kollern (ugs.);kullern;kugeln
-Kodewort;Schlüsselwort;Zugangscode;Kennwort (Hauptform);Geheimcode;Losung;Codewort;Zugangswort;Passwort (Hauptform);Losungswort;Parole
 packen;einpacken
 Ratschluss;Urteil;Wille;Entscheidung;Entschlossenheit;Beschluss;das letzte Wort (ugs.);Entschluss;Entscheid (schweiz.)
 dreckig machen;versiffen;beschmutzen;verschmutzen
diff --git a/preprocessing.py b/preprocessing.py
index e9f5275..cfc29b5 100644
--- a/preprocessing.py
+++ b/preprocessing.py
@@ -20,12 +20,14 @@ csv.field_size_limit(sys.maxsize)
 
 def printRandomDoc(textacyCorpus):
     print()
+    # Print the corpus size, then the index, text and metadata of one pseudo-randomly chosen document.
     print("len(textacyCorpus) = %i" % len(textacyCorpus))
     randIndex = int((len(textacyCorpus) - 1) * random.random())
     print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
+    # NOTE(review): assuming stdlib random.random() in [0, 1), randIndex never equals len - 1 for len > 1 (last doc is never shown) - confirm.
     print()
 
-
+"""
 def getFirstSynonym(word, thesaurus_gen):
 
         word = word.lower()
@@ -56,10 +58,9 @@ def getFirstSynonym(word, thesaurus_gen):
                             return w
 
         return word # zur Not die eingabe ausgeben
+"""
 
-
-def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False):
-    import re
+def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False, normalize_synonyms=False):
 
     # use preprocessing
     if customPreprocessing is not None:
@@ -156,17 +157,14 @@ def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=No
         tokens.remove("\n")
     while "\n\n" in tokens:
         tokens.remove("\n\n")
-    """
-    tokenz = []
-    for tok in tokens:
-        tokenz.append(str(getFirstSynonym(tok,THESAURUS_gen)))
-    tokens = tokenz
-    """
-    tokens = [str(getFirstSynonym(tok,THESAURUS_gen)) for tok in tokens]
+
+    # TODO: expand abbreviations (especially TU -> Technische Universität)
+
+    if normalize_synonyms:
+        tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens]
 
     return " ".join(tokens)
 
-
 def generateTextfromXML(path2xml, clean=True, textfield='Beschreibung'):
     import xml.etree.ElementTree as ET
 
@@ -196,7 +194,7 @@ def generateMetadatafromXML(path2xml, keys=["Loesung","Kategorie","Zusammenfassu
 
         yield metadata
 
-def generateFromXML(path2xml, clean=True, textfield='Beschreibung'):
+def generateFromXML(path2xml,  textfield='Beschreibung', clean=False, normalize_Synonyms=False):
     import xml.etree.ElementTree as ET
 
     tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
@@ -208,17 +206,55 @@ def generateFromXML(path2xml, clean=True, textfield='Beschreibung'):
         for field in ticket:
             if field.tag == textfield:
                 if clean:
-                    text = cleanText(field.text)
+                    text = cleanText(field.text,normalize_synonyms=normalize_Synonyms,lemmatize=False)
                 else:
                     text = field.text
             else:
+                # TODO: should the metadata fields be cleaned here as well?
                 metadata[field.tag] = field.text
         yield text, metadata
 
+def getFirstSynonym(word, thesaurus_gen):
+    """Return the main form of the first synonym block that contains `word`, else `word` lower-cased.
+
+    `thesaurus_gen` must yield lists of synonym strings; it is iterated from the start on every call.
+    """
+    word = word.lower()
+    for syn_block in thesaurus_gen:  # syn_block is one list of synonyms
+        for syn in syn_block:
+            syn = syn.lower()
+            if re.match(r'\A[\w-]+\Z', syn):  # entry is a single word: compare it as a whole
+                if word == syn:
+                    return getHauptform(syn_block, word)
+            # Multi-word or annotated entry: drop "(...)" notes and compare whole words only;
+            # a plain substring test would let e.g. "ab" match "abdüsen (ugs.)".
+            elif word in re.sub(r'\([^)]*\)', ' ', syn).split():
+                return getHauptform(syn_block, word)
+    return word  # last resort: return the (lower-cased) input word
+
+
+def getHauptform(syn_block, word, default_return_first_Syn=False):
+    """Return the main form of `syn_block`: the first plain token of the entry tagged "hauptform", else a fallback (see below)."""
+    for syn in syn_block:
+        syn = syn.lower()
+        # Only entries of at most two space-separated tokens qualify, e.g. "passwort (hauptform)".
+        if "hauptform" in syn and len(syn.split(" ")) <= 2:
+            # return the first token that does not start with a parenthesised group
+            for w in syn.split(" "):
+                if not re.match(r'\([^)]+\)', w):
+                    return w
+
+    if default_return_first_Syn:
+        # no main form found: return the first entry not starting with a parenthesised group (NOTE(review): multi-word entries are not filtered out and the original case is kept)
+        for w in syn_block:
+            if not re.match(r'\([^)]+\)', w):
+                return w
+    return word  # last resort: return the word that was passed in
+
 
 ####################'####################'####################'####################'####################'##############
 
-
+import de_core_news_md
 DATAPATH = "ticketSamples.xml"
 DATAPATH_thesaurus = "openthesaurus.csv"
 
@@ -227,8 +263,11 @@ LANGUAGE = 'de'
 
 ####################'####################'####################'####################'####################'##############
 
-PARSER = spacy.load(LANGUAGE)
-THESAURUS_gen = textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";")  # generator [[a,b,c,..],[a,b,c,..],...]
+PARSER = de_core_news_md.load()#spacy.load(LANGUAGE)
+# The list() is essential: a generator is advanced by every lookup, so later lookups would not return the same synonyms.
+THESAURUS_list=list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))
+
+
 
 
 ## files to textacy-corpus
@@ -236,27 +275,40 @@ textacyCorpus = textacy.Corpus(PARSER)
 
 print("add texts to textacy-corpus...")
 #textacyCorpus.add_texts(texts=generateTextfromXML(DATAPATH), metadatas=generateMetadatafromXML(DATAPATH))
-for txt, dic in generateFromXML(DATAPATH):
+for txt, dic in generateFromXML(DATAPATH,normalize_Synonyms=True,clean=True):
     textacyCorpus.add_text(txt,dic)
 
 
 
+for doc in textacyCorpus:
+    print(doc.text)
 
-
-print(textacyCorpus[2].text)
+#print(textacyCorpus[2].text)
 #printRandomDoc(textacyCorpus)
 #print(textacyCorpus[len(textacyCorpus)-1].text)
 
 
-
-
-
-
-
-
-
-
-
+print()
+print()
+# Second run: same pipeline as above, but with normalize_Synonyms=False.
+#################### 1
+# NOTE(review): the model is loaded a second time here; reusing the PARSER created earlier looks sufficient - confirm.
+PARSER = de_core_news_md.load()#spacy.load(LANGUAGE)
+
+## files to textacy-corpus
+textacyCorpus = textacy.Corpus(PARSER)
+# clean=True keeps the cleanText step; only the synonym replacement is switched off
+for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=False, clean=True):
+    textacyCorpus.add_text(txt,dic)
+
+# print the text of every document in the rebuilt corpus
+for doc in textacyCorpus:
+    print(doc.text)
+
+
+
+print()
+print()
 
 
 
diff --git a/test.py b/test.py
index ec4a3db..f4e8009 100644
--- a/test.py
+++ b/test.py
@@ -28,95 +28,59 @@ def generateFromXML(path2xml, clean=True, textfield='Beschreibung'):
         yield text, metadata
 
 
-def getFirstSynonym(word, thesaurus_gen):
-
-        word = word.lower()
-        # TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python
-
-
-        # durch den thesaurrus iterieren
-        for syn_block in thesaurus_gen:  # syn_block ist eine liste mit Synonymen
-
-            # durch den synonymblock iterieren
-            for syn in syn_block:
-                syn = syn.lower().split(" ") if not re.match(r'\A[\w-]+\Z', syn) else syn # aus synonym mach liste (um evtl. sätze zu identifieziren)
-
-                # falls das wort in dem synonym enthalten ist (also == einem Wort in der liste ist)
-                if word in syn:
-
-                    # Hauptform suchen
-                    if "Hauptform" in syn:
-                        # nicht ausgeben, falls es in Klammern steht
-                        for w in syn:
-                            if not re.match(r'\([^)]+\)', w) and w is not None:
-                                return w
-
-                    # falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht
-                    if len(syn) == 1:
-                        w = syn[0]
-                        if not re.match(r'\([^)]+\)', w) and w is not None:
-                            return w
-
-        return word # zur Not die eingabe ausgeben
-
-
 
 def getFirstSynonym(word, thesaurus_gen):
 
         word = word.lower()
-        # TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python
-
 
         # durch den thesaurrus iterieren
         for syn_block in thesaurus_gen:  # syn_block ist eine liste mit Synonymen
 
             for syn in syn_block:
-
+                syn = syn.lower()
                 if re.match(r'\A[\w-]+\Z', syn):    #falls syn einzelwort ist
                     if word == syn:
-                        getHauptform(syn_block)
+                        return getHauptform(syn_block,word)
+                else:   # multi-word/annotated entry: strip "(...)" notes, then match whole words (a substring test lets "ab" hit "abdüsen (ugs.)")
+                    if word in re.sub(r'\([^)]*\)', ' ', syn).split():
+                        return getHauptform(syn_block,word)
+        return word # last resort: return the (lower-cased) input word
 
 
 
 
-def getHauptform(syn_block):
-    for s in syn_block:
-        if "Hauptform" in s:
+def getHauptform(syn_block,word,default_return_first_Syn=False):
+    """Return the first plain token of the entry tagged "hauptform" in `syn_block`, else a fallback (see below)."""
+    for syn in syn_block:
+        syn = syn.lower()
+        # any entry whose lower-cased text contains "hauptform" qualifies
+        if "hauptform" in syn:
             # nicht ausgeben, falls es in Klammern steht
-            for w in s:
-                if not re.match(r'\([^)]+\)', w) and w is not None:
+            for w in syn.split(" "):
+                if not re.match(r'\([^)]+\)', w):
                     return w
 
-    # falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht
-    if len(s) == 1:
-        w = s[0]
-        if not re.match(r'\([^)]+\)', w) and w is not None:
-            return w
+    if default_return_first_Syn:
+        # no main form found: return the first entry not starting with a parenthesised group (NOTE(review): multi-word entries are not filtered out)
+        for w in syn_block:
+            if not re.match(r'\([^)]+\)', w):
+                return w
+    return word  # last resort: return the word that was passed in
 
+THESAURUS_gen = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))  # list of rows [[a,b,c,..],[a,b,c,..],...]; materialised so it can be iterated repeatedly
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-strings = ["passwort",""]
-THESAURUS_gen = textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";")  # generator [[a,b,c,..],[a,b,c,..],...]
+strings = ["anmachen","Kernspaltung"]
+#strings = ["Kernspaltung","Kennwort"]
 
 for s in strings:
     print(getFirstSynonym(s,THESAURUS_gen))
 
+strings = ["Kennwort"]
+#THESAURUS_gen = textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";")  # generator [[a,b,c,..],[a,b,c,..],...]
+
+for s in strings:
+    print(getFirstSynonym(s, THESAURUS_gen))
+