diff --git a/aufgaben.txt b/aufgaben.txt
new file mode 100644
index 0000000..43a0cb3
--- /dev/null
+++ b/aufgaben.txt
@@ -0,0 +1,32 @@
+Grußformeln am Anfang raus
+
+whitelist (inkl. kb-keywords)
+akronyme & abk. drin lassen
+
+tagging vor normalisierung
+
+groß/klein rumexperimentieren
+
+bigramme nicht auf normtext
+
+relevanz bestimmter wörter
+
+zahlen drin lassen
+
+ticket-subj mit einbeziehen
+
+topics nach lda von itmc bestimmen lassen
+
+baumhierarchie der category einbeziehen (ggf. datensatz verbessern)
+
+aktuelle technische begriffe automatisch in whitelist aufnehmen
+
+levenshtein/hamming distanz statt autokorrekt (wenn kleiner als x dann ists das gleiche wort)
+
+TODO mittwoch: volltextindizierung (Termhäufigkeiten, bei zahlen vorgänger/nachfolger als ein term)
+
+hauptverb (root) drin lassen
+
+kategorien verkleinern: ontologien/ornamigram
+
+Footer/Header raus
\ No newline at end of file
diff --git a/main.py b/main.py
index cf1c57b..0319a60 100644
--- a/main.py
+++ b/main.py
@@ -31,6 +31,19 @@ start = time.time()
 # todo modelle testen
 
 
+
+
+
+
+
+
+
+
+
+
+
+
+
 logprint("main.py started at {}".format(datetime.now()))
 
 
diff --git a/miscellaneous.py b/miscellaneous.py
index 902cc4d..b493396 100644
--- a/miscellaneous.py
+++ b/miscellaneous.py
@@ -16,6 +16,7 @@ import glob, os
 from textacy.fileio import open_sesame
 import json
 from spacy.tokens.doc import Doc as SpacyDoc
+import operator
 
 csv.field_size_limit(sys.maxsize)
 FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
@@ -124,6 +125,10 @@ def list_from_files(*paths):
 def breakpoint():
     pass
 
+def sort_dictionary(dict):
+    """Return the (key, value) pairs of `dict` as a list sorted ascending by value."""
+    return sorted(dict.items(), key=operator.itemgetter(1))
+
 def normalize(string):
     # replaceRockDots
     string = re.sub(r'[ß]', "ss", string.lower())
diff --git a/preprocessing.py b/preprocessing.py
index 422667f..444fcef 100644
--- a/preprocessing.py
+++ b/preprocessing.py
@@ -296,6 +296,7 @@ path2firstnameslist = ressources_path + config.get("firstnames","pickle_file")
 
 
 path2DEstopwordlist = ressources_path + config.get("de_stopwords", "pickle_file")
+
 path2ENstopwordlist = ressources_path + config.get("en_stopwords", "pickle_file")
 
 custom_words = get_list_from_config("preprocessing", "custom_words")
diff --git a/test.py b/test.py
index 37ba096..add5c78 100644
--- a/test.py
+++ b/test.py
@@ -21,18 +21,18 @@ FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
 import draw
 
 
-
+"""
 # load  corpus
 corpus_de_path = FILEPATH + config.get("de_corpus", "path")
 preCorpus_name = "de" + "_pre_ticket"
 corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
 logprint("Corpus loaded: {0}".format(corpus.lang))
+#todo randomize corpus
 
 
 
 
 
-#todo randomize
 
 
 split = 0.8
@@ -64,9 +64,10 @@ model.fit(doc_term_matrix)
 
 
 
+
 compenents = model.model.components_
 
-"""
+
 components_ : array, [n_components, n_features]
 
 Variational parameters for topic word distribution.
@@ -78,9 +79,78 @@ the number of times word j was assigned to topic i.
 It can also be viewed as distribution over the words for each topic after normalization: 
 model.components_ / model.components_.sum(axis=1)[:, np.newaxis].
 
-"""
+
 
 test_doc = corpus_test[0]
+bla = test_doc.to_bag_of_terms(ngrams=1, named_entities=True, normalize=u'lower', lemmatize=None, lowercase=True, weighting=u'count', as_strings=False)
+key_list = bla.keys()
+bla_list = list(bla)
+
+print(bla)
+print(bla_list)
+
+for k in bla.keys():
+    print(id2term[k])
+
+"""
+
+
+"""
+
+ressources_path = FILEPATH +  "ressources/"
+path2DEstopwordlist = ressources_path + config.get("de_stopwords", "pickle_file")
+DE_STOP_WORDS = load_obj(path2DEstopwordlist)
+
+
+
+# load  corpus
+corpus_de_path = FILEPATH + config.get("de_corpus", "path")
+rawCorpus_name = "de" + "_raw_ticket"
+corpus, parser = load_corpus(corpus_name=rawCorpus_name, corpus_path=corpus_de_path)
+
+#parser = spacy.load("de")
+#corpus = textacy.Corpus(parser)
+
+#TODO mittwoch: volltextindizierung (Termhäufigkeiten, bei zahlen vorgänger/nachfolger als ein term)
+"""
+testtxt = "Sehr geehrtes ITMC Service Team,\r\n\r\nseit ein einiger Zeit scheint der Netzwerkanschluss eines Kollegen" \
+          " an das Intranet der BMP mit der Dosennummer G1 303/04/12.05 (G1 4 26-1) in Raum G1-426 nicht mehr zu funktionieren. " \
+          "\r\nIch würde Sie daher bitten diese Mail an den zuständigen Kollegen weiterzuleiten, um die Leitung vielleicht einmal zu Prüfen.\r\n\r\n" \
+          "Des Weiteren hätte ich noch eine Frage bezüglich der Möglichkeit zur Nutzung einer VPN Verbindung aus" \
+          " unserem Intranet heraus zu einem fremden Netzwerk. Dies ist zwar über das WLAN-Netz möglich, jedoch nicht " \
+          "aus unserem Netzwerk heraus. Vielleicht können Sie mir mitteilen an welchen Kollegen ich mich bezüglich" \
+          " dieses Problem wenden kann.\r\n\r\nBei Rückfragen stehe ich gerne zur Verfügung!\r\n\r\nBeste Grüße," \
+          "\r\n\r\nNicolas Rauner\r\n\r\nLS Biomaterialien und Polymerwissenschaften\r\nFakultät Bio- und Chemieingenieurwesen\r\nTU Dortmund" \
+          " \r\nD-44227 Dortmund\r\n\r\nTel: + 49-(0)231 / 755 - 3015\r\nFax: + 49-(0)231 / 755 - 2480\r\n\r\nwww.ls-bmp.de <http://www.ls-bmp.de/>"
+
+#corpus.add_text(testtxt)
+"""
+
+term_dict_w_stop = {}
+term_dict_wo_stop = {}
+footings = ["gruss", "grusse", "gruesse", "gruessen", "grusses"]
+
+for doc in corpus:
+
+    tokens = [tok for tok in doc]
+
+    # footer raus
+
+
+    for i,tok in enumerate(tokens):
+        text = tok.text
+        text = re.sub(r'[ß]', "ss", text)
+        text = re.sub(r'[ö]', "oe", text)
+        text = re.sub(r'[ü]', "ue", text)
+        text = re.sub(r'[ä]', "ae", text)
+
+
+
+        for gr in footings:
+            if gr in text.lower():
+                tokens = tokens[0:i]
+                #print(tokens)
+                break
 
 
 
@@ -88,12 +158,85 @@ test_doc = corpus_test[0]
 
 
 
+    for i,tok in enumerate(tokens):
+
+        if tok.is_space or tok.is_punct or tok.like_url or tok.like_email:
+            continue
+
+
+        # Merge a numeric predecessor into the current term ("<number> <token>"),
+        # so a number and the token following it are counted as one term.
+        # `i != 0` (value comparison) replaces `i is not 0`: identity tests
+        # against int literals only work through CPython's small-int caching
+        # and raise a SyntaxWarning on Python >= 3.8.
+        if i != 0 and tokens[i - 1].like_num:
+            text = tokens[i - 1].text + " " + tok.text
+        else:
+            text = tok.text
+
+
+
+
+        # replaceRockDots
+        text = re.sub(r'[ß]', "ss", text)
+        text = re.sub(r'[ö]', "oe", text)
+        text = re.sub(r'[ü]', "ue", text)
+        text = re.sub(r'[ä]', "ae", text)
 
 
 
 
 
 
+        if text not in term_dict_w_stop.keys():
+            term_dict_w_stop[text] = 1
+        else:
+            term_dict_w_stop[text] += 1
+
+
+        if text.lower() not in DE_STOP_WORDS:
+            if text not in term_dict_wo_stop.keys():
+                term_dict_wo_stop[text] = 1
+            else:
+                term_dict_wo_stop[text] += 1
+
+
+
+
+term_dict_sorted = sort_dictionary(term_dict_w_stop)
+term_dict_wo_sorted = sort_dictionary(term_dict_wo_stop)
+
+split_value = 0.2
+from_ = int((1-split_value) * float(len(term_dict_sorted))) #1-splt
+to_ = len(term_dict_sorted)
+
+#logprint(term_dict_sorted[from_: to_])
+#logprint("\n")
+#logprint(term_dict_wo_sorted[from_: to_])
+
+
+for elem in term_dict_sorted:
+    logprint(elem)
+
+logprint("\n")
+logprint("\n")
+logprint("\n")
+logprint("\n")
+
+for elem in term_dict_wo_sorted:
+    logprint(elem)
+
+"""
+
+
+in_path= "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/log/terms_without_stop.txt"
+out_path= "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/log/terms_without_stopwords.txt"
+
+# Read every line inside `with` so the input file is closed before writing starts.
+with open(in_path) as in_file:
+    gen = reversed(list(in_file))
+textacy.fileio.write_file_lines(gen,out_path)
+
 
 
 
@@ -149,27 +292,6 @@ print("\n\n\nTime Elapsed Test:{0}\n\n".format(end - start))
 
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
 """