diff --git a/test.py b/test.py
index 6992c3c..8820f61 100644
--- a/test.py
+++ b/test.py
@@ -456,7 +456,7 @@
 maybe split categories into subcategories
 general: fix utf, split words on special characters
-remove names
+remove names, addresses after greetings
 remove emails, urls, numbers
 maybe even everything that contains any of those (or contains a .toplvldomain or special characters,
 or anything that contains an @)
diff --git a/testo.py b/testo.py
index d6e9e9d..a88bb85 100644
--- a/testo.py
+++ b/testo.py
@@ -187,6 +187,13 @@ def remove_short_words():
 def remove_first_names():
     return lambda tok: tok.lower_ not in [name.lower() for name in VORNAMEN]
+
+
+# if a word is not in the vocab, first check whether a subword is known; if so, check whether there is junk before or after it and split it off if necessary
+
+
+
+
 ############# strings
 def stringcleaning(stringstream, funclist):
@@ -196,6 +203,10 @@ def stringcleaning(stringstream, funclist):
         yield string

+def seperate_words_on_regex(regex=regex_specialChars):
+    return lambda string: " ".join(re.compile(regex).split(string))
+
+
 def remove_words_containing_topLVL():
     return lambda string: " ".join([w.lower() for w in string.split() if not re.search(regex_topLvl, w) ])
@@ -212,10 +223,15 @@ def replacePhonenumbers(replace_with="PHONENUMBER"):
     return lambda string: textacy.preprocess.replace_phone_numbers(string.lower(), replace_with=replace_with)

-def replaceHardS(replace_with="ss"):
+def replaceSharpS(replace_with="ss"):
     return lambda string: re.sub(r'[ß]',replace_with,string.lower())

+def replaceRockDots():
+    return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))
+
+
+
 def fixUnicode():
     return lambda string: textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')
@@ -230,28 +246,36 @@ def lemmatizeWord(word,filepath=LEMMAS):

 def lemmatize():
     #todo https://alpha.spacy.io/docs/usage/adding-languages#lemmatizer
-    return lambda tok: lemmatizeWord(tok.lower_)
+    return lambda doc: " ".join([lemmatizeWord(tok.lower_) for tok in doc])


-def processTextstream(textstream, string_funclist, tok_funclist,parser=DE_PARSER):
+def processTextstream(textstream, string_funclist, tok_funclist, parser=DE_PARSER, single_doc_func=None):
     """
     :param textstream: string-gen
     :param funclist: [func]
     :param parser: spacy-parser
     :return: string-gen
     """
+    # first the string-level methods
     pipe = parser.pipe(stringcleaning(textstream,string_funclist))
-
+    tokens=[]
     for doc in pipe:
         tokens = [tok for tok in doc]

-        tokens = processTokens(tokens,tok_funclist,parser)
+        # then the token-level ones
+        tokens = processTokens(tokens,tok_funclist)

-        yield " ".join([tok.lower_ for tok in tokens])
+        if single_doc_func is not None:
+            yield single_doc_func(parser(" ".join([tok.lower_ for tok in tokens])))
+        else:
+            yield " ".join([tok.lower_ for tok in tokens])


-def processTokens(tokens, funclist, parser):
+
+
+
+def processTokens(tokens, funclist):
     # in:tokenlist, funclist
     # out: tokenlist
     for f in funclist:
@@ -261,25 +285,33 @@ def processTokens(tokens, funclist):
     return tokens

+
+
 string_comp=[
-    replaceHardS(),
+    fixUnicode(),
+    replaceRockDots(),
     remove_words_containing_topLVL(),
-    replaceSpecialChars(),
+    seperate_words_on_regex()
 ]

 tok_comp=[
-    removeENT(["PERSON"]),
+    #removeENT(["PERSON"]),
     remove_words_containing_Numbers(),
-    #keepPOS(["NOUN"]),
     removePOS(["PUNCT","SPACE","NUM"]),
     removeWords(de_stop_words),
     remove_long_words(),
     remove_short_words(),
-    remove_first_names()
+    remove_first_names(),
+
+    #keepPOS(["NOUN"]),
+
 ]
+
+single_doc_func = lemmatize()
+
+
 """
@@ -332,7 +364,7 @@ ticketcorpus = textacy.Corpus(DE_PARSER)
 ## add files to textacy-corpus,
 printlog("add texts to textacy-corpus")
 ticketcorpus.add_texts(
-    processTextstream(csv_to_contentStream(path2csv,"Description"), string_funclist=string_comp,tok_funclist=tok_comp)
+    processTextstream(csv_to_contentStream(path2csv,"Description"), string_funclist=string_comp, tok_funclist=tok_comp, single_doc_func=single_doc_func)
 )

 for i in range(10):
@@ -362,33 +394,6 @@ for i in range(10):

-
-
-
-
-
-words = [
-    "uniaccount",
-    "nr54065467",
-    "nr54065467",
-    "455a33c5,"
-    "tvt?=",
-    "tanja.saborowski@tu-dortmund.de",
-    "-",
-    "m-sw1-vl4053.itmc.tu-dortmund.de",
-    "------problem--------"
-]
-
-topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
-specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
-
-for w in words:
-    print(stringcleaning(w,string_comp))
-    #print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
-    #print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
-    #result = specialFinder.sub(" ", w)
-    #print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
-
-    #print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
 """
diff --git a/testra.py b/testra.py
new file mode 100644
index 0000000..2850c1f
--- /dev/null
+++ b/testra.py
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+import corenlp as corenlp
+import os
+import re
+import textacy
+import nltk
+from textblob_de import TextBlobDE
+from textblob_de import PatternParser
+
+filepath = "lemmatization-de.txt"
+
+
+blob = TextBlobDE(str(textacy.fileio.read_file("teststring.txt")),parser=PatternParser(pprint=True, lemmata=True))
+
+print(blob.parse())
+
+
+# first column to {lemma : id}, second column to {word : id}
+
+
+"""http://www.lexiconista.com/datasets/lemmatization/"""
+
+lemma2id = {}
+word2id = {}
+
+for id,line in enumerate(list(textacy.fileio.read_file_lines(filepath=filepath))):
+
+    lemma = line.split()[0].strip().lower()
+    if lemma not in lemma2id:
+        lemma2id[lemma] = id
+
+    word = line.split()[1].strip().lower()
+
+    word2id[word] = lemma2id[lemma]  # key by the lemma; the word itself is not a key of lemma2id
+
+
+"""
+regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|?]'
+
+
+def stringcleaning(stringstream, funclist):
+    for string in stringstream:
+        for f in funclist:
+            string = f(string)
+        yield string
+
+
+def seperate_words_on_regex(regex=regex_specialChars):
+    return lambda string: " ".join(re.compile(regex).split(string))
+
+
+words = [
+    "uniaccount",
+    "nr54065467",
+    "nr54065467",
+    "455a33c5",
+    "tvt?=",
+    "tanja.saborowski@tu-dortmund.de",
+    "-",
+    "m-sw1-vl4053.itmc.tu-dortmund.de",
+    "------problem--------"
+]
+
+
+topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
+specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
+
+for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):
+    print(s.strip())
+
+    #print(stringcleaning(w,string_comp))
+    #print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
+    #print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
+    #result = specialFinder.sub(" ", w)
+    #print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
+
+    #print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
+"""
\ No newline at end of file
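
Note on the processTextstream change in testo.py: cleaning now runs in three stages (string-level functions like string_comp, token-level filters like tok_comp, and an optional whole-document function such as lemmatize() passed as single_doc_func). The following is a minimal sketch of that composition with plain whitespace tokenization standing in for DE_PARSER; simple_process_textstream, the predicate-style token filters, and the sample ticket text are illustrative only and not part of the repository.

# Sketch of the three-stage pipeline; assumes token functions are predicates
# used as filters, which is how tok_comp appears to be applied.
def simple_process_textstream(textstream, string_funcs, token_funcs, single_doc_func=None):
    for text in textstream:
        # stage 1: string-level cleaning
        for f in string_funcs:
            text = f(text)
        # stage 2: token-level filtering (whitespace tokens instead of spaCy tokens)
        tokens = text.split()
        for f in token_funcs:
            tokens = [tok for tok in tokens if f(tok)]
        cleaned = " ".join(tokens)
        # stage 3: optional whole-document function (e.g. lemmatization)
        yield single_doc_func(cleaned) if single_doc_func is not None else cleaned

if __name__ == "__main__":
    import re
    string_funcs = [lambda s: s.lower(), lambda s: " ".join(re.split(r"[-_/]", s))]
    token_funcs = [lambda tok: len(tok) > 2, lambda tok: not any(c.isdigit() for c in tok)]
    for out in simple_process_textstream(["Drucker-Problem im Raum E23"], string_funcs, token_funcs):
        print(out)  # -> "drucker problem raum"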
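
For the lemmatization-de.txt lookup started in testra.py, a direct word-to-lemma dictionary is often more convenient than the two id dictionaries. A small sketch, assuming one "lemma whitespace word" pair per line as in the lexiconista dataset; build_lemma_lookup and lemmatize_word are illustrative helpers, not functions from the repository.

# Sketch: build a {word: lemma} lookup from the lexiconista lemmatization list.
# Assumption: each line holds "lemma<whitespace>word"; malformed lines are skipped.
import io

def build_lemma_lookup(path="lemmatization-de.txt", encoding="utf-8"):
    word2lemma = {}
    with io.open(path, encoding=encoding) as f:
        for line in f:
            parts = line.split()
            if len(parts) < 2:
                continue
            lemma, word = parts[0].strip().lower(), parts[1].strip().lower()
            word2lemma[word] = lemma
    return word2lemma

def lemmatize_word(word, word2lemma):
    # fall back to the word itself when it is not in the list
    return word2lemma.get(word.lower(), word.lower())

Joining lemmatize_word over the whitespace-split tokens of a cleaned document gives the same shape of output as the doc-level lemmatize() used as single_doc_func above.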