From 26c0f37ec8936fccfb679412359341a310f0658b Mon Sep 17 00:00:00 2001
From: "jannis.grundmann" <jannis.grundmann@tu-dortmund.de>
Date: Wed, 13 Sep 2017 12:53:09 +0200
Subject: [PATCH] =?UTF-8?q?tickets.csv=20kann=20in=20corpus=20=C3=BCbertra?=
 =?UTF-8?q?gen=20werden?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 M42-Export/Tickets_small.csv | 169 +++++++++++++++++
 test.py                      | 350 +++++++++++++++++++++++++----------
 2 files changed, 422 insertions(+), 97 deletions(-)
 create mode 100644 M42-Export/Tickets_small.csv

diff --git a/M42-Export/Tickets_small.csv b/M42-Export/Tickets_small.csv
new file mode 100644
index 0000000..7936a66
--- /dev/null
+++ b/M42-Export/Tickets_small.csv
@@ -0,0 +1,169 @@
+﻿"TicketNumber";"Subject";"CreatedDate";"categoryName";"Impact";"Urgency";"BenutzerID";"VerantwortlicherID";"EigentuemerID";"Description";"Solution"
+"INC20357";"schulungstest";"21.07.2015 08:19:34";"ZHB";"2 - Mittel (Abt./Bereich)";"B - Normal";"aa8315f5-52c3-e411-80c7-0050569c58f5";"";"aa8315f5-52c3-e411-80c7-0050569c58f5";"kevin arbeite gefälligst :)";""
+"INC40481";"Telephone Contract";"13.08.2015 14:18:57";"Neuanschluss";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Telefon-Neuanschluss
+Antragsteller:
+Melanie Hinrichs 
+melanie.hinrichs@tu-dortmund.de 
+　
+　
+　
+Terminvorschlag unbestimmt 
+﻿"TicketNumber";"Subject";"CreatedDate";"categoryName";"Impact";"Urgency";"BenutzerID";"VerantwortlicherID";"EigentuemerID";"Description";"Solution"
+"INC20357";"schulungstest";"21.07.2015 08:19:34";"ZHB";"2 - Mittel (Abt./Bereich)";"B - Normal";"aa8315f5-52c3-e411-80c7-0050569c58f5";"";"aa8315f5-52c3-e411-80c7-0050569c58f5";"kevin arbeite gefälligst :)";""
+"INC40481";"Telephone Contract";"13.08.2015 14:18:57";"Neuanschluss";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Telefon-Neuanschluss
+Antragsteller:
+Melanie Hinrichs 
+melanie.hinrichs@tu-dortmund.de 
+　
+　
+　
+Terminvorschlag unbestimmt 
+Einrichtung Dezernat 3 
+Abteilung Abteilung 2 
+PSP Element L-11-10000-100-302300 
+UniAccount myvowest(Westerdorf, Yvonne) 
+Gebäude Pavillon 8 
+Raum ID 031 (63292) 
+Telefondose keine vorhanden 
+Telefonnr. - 
+Eintrag Telefonbuch 
+E-Mail melanie.hinrichs@tu-dortmund.de 
+Voicemail Nicht erwünscht 
+Ansprechpartner Melanie Hinrichs 
+Tel. Ansprechpartner 5848 
+Verantwortlicher Nutzer - 
+Type Amt
+Bemerkung:
+Es wird ein Telefon benötigt,ein Telefon mit 6 Speicherpl.f.die Gruppenfunktion ist ausreichend. Die Möbel werden am 10.06.2015 aufgestellt.Weder Netzwerkdose noch Telefondose vorhanden. Dez.6 hat Vorbereitungen getroffen.";"Frau Hinrichs überdenkt die Situation und macht dann neue Anträge.
+Dieses Ticket wird geschlossen"
+"INC40483";"Telephone Contract";"13.08.2015 14:22:06";"Neuanschluss";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Telefon-Neuanschluss
+Antragsteller:
+Anja Kulmsee 
+anja.kulmsee@tu-dortmund.de 
+　
+　
+　
+Terminvorschlag 03.08.2015 
+Einrichtung Fk06 Dekanat 
+Abteilung Bereich Studium und Lehre 
+PSP Element L-11-10000-100-060011 
+UniAccount manjkulm(Kulmsee, Anja) 
+Gebäude CT Geschossbau 2 
+Raum ID G2-3.22 (64882) 
+Telefondose 
+Telefonnr. - 
+Eintrag Telefonbuch 
+E-Mail anja.kulmsee@tu-dortmund.de 
+Voicemail Nicht erwünscht 
+Ansprechpartner Anja Kulmsee 
+Tel. Ansprechpartner 6179, 7370, 7179 
+Verantwortlicher Nutzer - 
+Type Amt
+Bemerkung:
+Der Anschluß ist für ein Faxgerät. Wenn möglich hätte ich gern die Rufnummer 3033.";"Faxnummer 3166 wurde unter die Telefonnummer 7179 im elektronischen Telefonbuch eingetragen"
+"INC40484";"Defekte Netzwerkdose / Frage zu VPN";"13.08.2015 14:25:50";"LAN";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Sehr geehrtes ITMC Service Team,
+
+seit ein einiger Zeit scheint der Netzwerkanschluss eines Kollegen an das Intranet der BMP mit der Dosennummer G1 303/04/12.05 (G1 4 26-1) in Raum G1-426 nicht mehr zu funktionieren. 
+Ich würde Sie daher bitten diese Mail an den zuständigen Kollegen weiterzuleiten, um die Leitung vielleicht einmal zu Prüfen.
+
+Des Weiteren hätte ich noch eine Frage bezüglich der Möglichkeit zur Nutzung einer VPN Verbindung aus unserem Intranet heraus zu einem fremden Netzwerk. Dies ist zwar über das WLAN-Netz möglich, jedoch nicht aus unserem Netzwerk heraus. Vielleicht können Sie mir mitteilen an welchen Kollegen ich mich bezüglich dieses Problem wenden kann.
+
+Bei Rückfragen stehe ich gerne zur Verfügung!
+
+Beste Grüße,
+
+Nicolas Rauner
+
+LS Biomaterialien und Polymerwissenschaften
+Fakultät Bio- und Chemieingenieurwesen
+TU Dortmund 
+D-44227 Dortmund
+
+Tel: + 49-(0)231 / 755 - 3015
+Fax: + 49-(0)231 / 755 - 2480
+
+www.ls-bmp.de <http://www.ls-bmp.de/>";"Hallo Herr Rauner,
+die Netzwerkdose  weist z. Z. keine Verbindungsprobleme auf. Falls doch welche bestehen, melden Sie sich bitte bei uns.
+ 
+Mit freunldichen Grüßen
+Aicha Oikrim"
+"INC40487";"(SSO) Login via Browser mit Zertifikat";"13.08.2015 14:54:57";"Betrieb";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Lieber Support,
+ich habe gerade versucht mich mit meiner Unicard im Firefox-Browser für das
+Service-Portal zu authentifizieren. Das hat vor einigen Wochen noch tadelos
+geklappt und mittlerweile bekomme ich folgende Fehlermeldung:
+　
+　
+
+Ich hoffe Sie können mir weiterhelfen.
+
+Vielen Dank und viele Grüße
+Sascha Feldhorst
+
+Dipl.-Inform.
+Sascha Feldhorst
+Wiss.-Ang.
+
+Technische Universität Dortmund
+Maschinenbau/Lehrstuhl für Förder- und Lagerwesen
+LogistikCampus
+Joseph-von-Fraunhofer-Str. 2-4
+D-44227 Dortmund
+
+Tel.: +49 231-755 40 73
+Fax: +49 231-755 47 68
+<mailto:sascha.feldhorst@tu-dortmund.de> sascha.feldhorst@tu-dortmund.de
+<http://www.flw.mb.tu-dortmund.de/> www.flw.mb.tu-dortmund.de
+
+Wichtiger Hinweis: Die Information in dieser E-Mail ist vertraulich. Sie ist
+ausschließlich für den Adressaten bestimmt. Sollten Sie nicht der für diese
+E-Mail bestimmte Adressat sein, unterrichten Sie bitte den Absender und
+vernichten Sie diese Mail. Vielen Dank. Unbeschadet der Korrespondenz per
+E-Mail, sind unsere Erklärungen ausschließlich final rechtsverbindlich, wenn
+sie in herkömmlicher Schriftform (mit eigenhändiger Unterschrift) oder durch
+Übermittlung eines solchen Schriftstücks per Telefax erfolgen.
+
+Important note: The information included in this e-mail is confidential. It
+is solely intended for the recipient. If you are not the intended recipient
+of this e-mail please contact the sender and delete this message. Thank you.
+Without prejudice of e-mail correspondence, our statements are only legally
+binding when they are made in the conventional written form (with personal
+signature) or when such documents are sent by fax.";"der Login via Zertifikat am SSO-Dienst mittels Firefox und UniCard sollte funktionieren. 
+Eventuell wurden durch ein Browserupdate die Einstellungen gelöscht. Bitte prüfen Sie ob die CA-Zertifikate installiert sind:
+https://pki.pca.dfn.de/tu-dortmund-chipcard-ca/cgi-bin/pub/pki?cmd=getStaticPage;name=index;id=2&RA_ID=0 ""https://pki.pca.dfn.de/tu-dortmund-chipcard-ca/cgi-bin/pub/pki?cmd=getStaticPage;name=index;id=2&RA_ID=0""
+und ob das Kryptographie Modul im Firefox hinterlegt ist:
+https://service.tu-dortmund.de/group/intra/authentifizierung"
+"INC40489";"Telephone Contract";"13.08.2015 14:57:23";"Elektronisches Telefonbuch";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Telefon-Umzug
+Antragsteller:
+Astrid Gramm 
+astrid.gramm@tu-dortmund.de 
+　
+　
+　
+Terminvorschlag 14.08.2015 
+Einrichtung Dezernat 2 
+Abteilung 2.5 
+PSP Element 
+UniAccount mnichofm(Hofmann, Nicole) 
+Gebäude Dezernat 5 
+Raum ID 201 (651430) 
+Telefondose Neztwerkdose: DT04.5/04.6 
+Telefonnr. 4821 
+Eintrag Telefonbuch 
+E-Mail astrid.gramm@tu-dortmund.de 
+Voicemail 
+Ansprechpartner Astrid Gramm 
+Tel. Ansprechpartner 5444 
+Verantwortlicher Nutzer 
+Type 
+Bemerkung:
+Frau Hofmann wird am 14.08.2015 in die WD 2 umziehen. Es ist der Raum 201a im OG (nicht 201)
+Eine Bezeichnung der Telefondose ist nicht vorhanden.";"erledigt"
+"INC40488";"Laptop macht komische Geräusche";"13.08.2015 14:56:24";"Verwaltung";"2 - Mittel (Abt./Bereich)";"B - Normal";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"9668e0af-7202-e711-0781-005056b025d0";"Hallo,
+mein Laptop macht seit eben komische Geräusche.
+Bitte um Klärung.
+Jan Hustadt
+(0231) 755-7248
+WD2, R. 112
+Dezernat 2 Hochschulentwicklung
+Abteilung 2.3 Organisationsentwicklung
+E-Mail: jan.hustadt@tu-dortmund.de";"Herr Alexev Swetlomier (HIWI) küümert sich bereits um das Laptop und Frau Herbst weiß auch Bescheid die zur Zeit im Urlaub ist"
diff --git a/test.py b/test.py
index 8c89e07..15c95ab 100644
--- a/test.py
+++ b/test.py
@@ -1,4 +1,8 @@
 # -*- coding: utf-8 -*-
+import time
+start = time.time()
+
+
 import csv
 import functools
 import os.path
@@ -11,23 +15,65 @@ import spacy
 import textacy
 from scipy import *
 from textacy import Vectorizer
-
+import warnings
 csv.field_size_limit(sys.maxsize)
 
 
 
-path2xml = "ticket.xml"
-import de_core_news_md
+# Load the configuration file
+import configparser as ConfigParser
+config = ConfigParser.ConfigParser()
+with open("config.ini") as f:
+    config.read_file(f)
 
 
-PARSER = de_core_news_md.load()
-corpus = textacy.Corpus(PARSER)
-thesauruspath = "openthesaurus.csv"
+
+path2xml = config.get("default","path2xml")
+thesauruspath = config.get("default","thesauruspath")
+
+
+DE_PARSER = spacy.load("de")
+
+de_stop_words=list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)
+
+
+corpus = textacy.Corpus(DE_PARSER)
+
 THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))
 
 
 
 
+############# misc
+def compose(*functions):
+    def compose2(f, g):
+        return lambda x: f(g(x))
+    return functools.reduce(compose2, functions, lambda x: x)
+
+def get_calling_function():
+    """finds the calling function in many decent cases.
+    https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name
+    """
+    fr = sys._getframe(1)   # inspect.stack()[1][0]
+    co = fr.f_code
+    for get in (
+        lambda:fr.f_globals[co.co_name],
+        lambda:getattr(fr.f_locals['self'], co.co_name),
+        lambda:getattr(fr.f_locals['cls'], co.co_name),
+        lambda:fr.f_back.f_locals[co.co_name], # nested
+        lambda:fr.f_back.f_locals['func'],  # decorators
+        lambda:fr.f_back.f_locals['meth'],
+        lambda:fr.f_back.f_locals['f'],
+        ):
+        try:
+            func = get()
+        except (KeyError, AttributeError):
+            pass
+        else:
+            if func.__code__ == co:
+                return func
+    raise AttributeError("func not found")
+
 def printRandomDoc(textacyCorpus):
     import random
     print()
@@ -40,6 +86,9 @@ def printRandomDoc(textacyCorpus):
 
 
 
+
+
+#############  on xml
 def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung'):
     """
     generates strings from XML
@@ -55,7 +104,6 @@ def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung'):
         for field in ticket:
             if field.tag == main_textfield:
                     yield field.text
-
 def generateMetadatafromTicketXML(path2xml, leave_out=['Beschreibung']):
     tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
     root = tree.getroot()
@@ -71,53 +119,161 @@ def generateMetadatafromTicketXML(path2xml, leave_out=['Beschreibung']):
 
 
 
+#############  on csv
 
-def processTextstream(textstream, funclist, parser=PARSER):
+def csv_to_contentStream(path2csv: str, content_collumn_name: str):
+    """
+    :param path2csv: string
+    :param content_collumn_name: string
+    :return: string-generator
+    """
+    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')
+    content_collumn = 0  # default: stays on the first column if no header cell matches the name
+
+    for i,lst in enumerate(stream):
+        if i == 0:
+            # look for desired column
+            for j,col in enumerate(lst):
+                if col == content_collumn_name:
+                    content_collumn = j
+        else:
+            yield lst[content_collumn]
+def csv_to_metaStream(path2csv: str, metalist: [str]):
+    """
+    :param path2csv: string
+    :param metalist: list of strings
+    :return: dict-generator
+    """
+    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')
+
+    content_collumn = 0  # default value; not read anywhere in this function
+    metaindices = []
+    metadata_temp = {}
+    for i,lst in enumerate(stream):
+        if i == 0:
+            for j,col in enumerate(lst):        # could be done more efficiently, but it only runs once (header row)
+                for key in metalist:
+                    if key == col:
+                        metaindices.append(j)
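+            metadata_temp = dict(zip(metalist,metaindices)) # e.g. {'Subject' : 1, 'categoryName' : 3, 'Solution' : 10}; NOTE: indices are collected in column order, so the pairing is only correct if metalist is in file column order and every key exists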
+
+        else:
+            metadata = metadata_temp.copy()
+            for key,value in metadata.items():
+                metadata[key] = lst[value]
+            yield metadata
+
+
+#############  on str-gen
+
+
+
+
+def processTokens(tokens, funclist, parser):
+    # in:tokenlist, funclist
+    # out: tokenlist
+    for f in funclist:
+        if 'bool' in str(f.__annotations__):
+            tokens = list(filter(f, tokens))
+
+        elif 'str' in str(f.__annotations__):
+            tokens = list(map(f, tokens))  # plain text
+            doc = parser(" ".join(tokens))  # re-parsed
+            tokens = [tok for tok in doc]  # tokens only
+
+        elif 'spacy.tokens.doc.Doc' in str(f.__annotations__):
+            toks = f(tokens)
+            tokens = [tok for tok in toks]
+
+        else:
+            warnings.warn("Unknown Annotation while preprocessing. Function: {0}".format(str(f)))
+
+    return tokens
+
+
+
+
+#############  return docs
+
+def keepUniqueTokens() -> spacy.tokens.Doc:
+    # TODO: should take tokens in and return a Doc; the lambda currently returns a set of lowercased strings
+    ret = lambda doc: (set([tok.lower_ for tok in doc]))
+
+    ret.__annotations__ = get_calling_function().__annotations__
+    return ret
+
+
+
+
+
+
+
+
+
+
+
+def processTextstream(textstream, funclist, parser=DE_PARSER):
+    """
+    :param textstream: string-gen
+    :param funclist: [func]
+    :param parser: spacy-parser
+    :return: string-gen
+    """
     # input:str-stream output:str-stream
     pipe = parser.pipe(textstream)
 
     for doc in pipe:
         tokens = [tok for tok in doc]
-        for f in funclist:
-            if 'bool' in str(f.__annotations__):
-                tokens = list(filter(f,tokens))
-
-            elif 'str' in str(f.__annotations__):
-                x=0
-                tokens = list(map(f, tokens))
-                #tokens = [f(tok.lower_) for tok in tokens]  #purer text
-                doc = parser(" ".join(tokens))    #geparsed
-                tokens = [tok for tok in doc]   #nur tokens
-
-            elif 'spacy.tokens.Doc' in str(f.__annotations__):
-                tokens = [tok for tok in f(tokens)]
-
-
-
+        tokens = processTokens(tokens,funclist,parser)
         yield " ".join([tok.lower_ for tok in tokens])
 
-def processDictstream(dictstream, funcdict, parser=PARSER): #todo das selbe wie mit textstream idee: processDoc(doc,funcs)
+def processDictstream(dictstream, funcdict, parser=DE_PARSER):
+    """
+
+    :param dictstream: dict-gen
+    :param funcdict:
+                    clean_in_meta = {
+                        "Solution":funclist,
+                        ...
+                    }
+
+    :param parser: spacy-parser
+    :return: dict-gen
+    """
     for dic in dictstream:
         result = {}
         for key, value in dic.items():
+
             if key in funcdict:
-                result[key] = funcdict[key](parser(value))
+
+                doc = parser(value)
+                tokens = [tok for tok in doc]
+                funclist = funcdict[key]
+
+                tokens = processTokens(tokens,funclist,parser)
+
+
+                result[key] = " ".join([tok.lower_ for tok in tokens])
+
+
             else:
                 result[key] = value
         yield result
 
 
 
+#############  return tokens
+
 def keepPOS(pos_list) -> bool:
     ret = lambda tok : tok.pos_ in pos_list
 
-    ret.__annotations__ = keepPOS.__annotations__
+    ret.__annotations__ = get_calling_function().__annotations__
     return ret
 
 def removePOS(pos_list)-> bool:
     ret = lambda tok : tok.pos_ not in pos_list
 
-    ret.__annotations__ = removePOS.__annotations__
+    ret.__annotations__ = get_calling_function().__annotations__
     return ret
 
 def removeWords(words, keep=None)-> bool:
@@ -131,86 +287,32 @@ def removeWords(words, keep=None)-> bool:
 
     ret = lambda tok :  tok.lower_ not in words
 
-    ret.__annotations__ = removeWords.__annotations__
+    ret.__annotations__ = get_calling_function().__annotations__
     return ret
 
 def keepENT(ent_list) -> bool:
     ret = lambda tok : tok.ent_type_ in ent_list
 
-    ret.__annotations__ = keepENT.__annotations__
+    ret.__annotations__ = get_calling_function().__annotations__
     return ret
 
 def removeENT(ent_list) -> bool:
     ret = lambda tok: tok.ent_type_ not in ent_list
 
-    ret.__annotations__ = removeENT.__annotations__
+    ret.__annotations__ = get_calling_function().__annotations__
     return ret
 
-
-
-def keepUniqueTokens() -> spacy.tokens.Doc:
-    ret = lambda doc: (set([tok.lower_ for tok in doc]))
-
-    ret.__annotations__ = keepUniqueTokens.__annotations__
-    return ret
-
-
 def lemmatize() -> str:
     ret = lambda tok:  tok.lemma_
 
-    ret.__annotations__ = lemmatize.__annotations__
+    ret.__annotations__ = get_calling_function().__annotations__
     return ret
 
 
-
-
-mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
-emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
-urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
-
-def replaceEmails(replace_with="EMAIL") -> str:
-    ret = lambda tok : emailFinder.sub(replace_with, tok.lower_)
-
-    ret.__annotations__ = replaceEmails.__annotations__
-    return ret
-
-def replaceURLs(replace_with="URL") -> str:
-    ret = lambda tok: textacy.preprocess.replace_urls(tok.lower_,replace_with=replace_with)
-    #ret = lambda tok: urlFinder.sub(replace_with,tok.lower_)
-
-    ret.__annotations__ = replaceURLs.__annotations__
-    return ret
-
-def replaceTwitterMentions(replace_with="TWITTER_MENTION") -> str:
-    ret = lambda tok : mentionFinder.sub(replace_with,tok.lower_)
-
-    ret.__annotations__ = replaceTwitterMentions.__annotations__
-    return ret
-
-def replaceNumbers(replace_with="NUMBER") -> str:
-    ret = lambda tok: textacy.preprocess.replace_numbers(tok.lower_, replace_with=replace_with)
-
-    ret.__annotations__ = replaceNumbers.__annotations__
-    return ret
-
-def replacePhonenumbers(replace_with="PHONENUMBER",parser=PARSER):
-    ret = lambda tok: textacy.preprocess.replace_phone_numbers(tok.lower_, replace_with=replace_with)
-
-    ret.__annotations__ = replacePhonenumbers.__annotations__
-    return ret
-
-
-def resolveAbbreviations():
-    pass #todo
-
-
-
-
-
 def normalizeSynonyms(default_return_first_Syn=False) -> str:
     ret = lambda tok : getFirstSynonym(tok.lower_, default_return_first_Syn=default_return_first_Syn)
 
-    ret.__annotations__ = normalizeSynonyms.__annotations__
+    ret.__annotations__ = get_calling_function().__annotations__
     return ret
 
 def getFirstSynonym(word, thesaurus=THESAURUS, default_return_first_Syn=False):
@@ -251,20 +353,71 @@ def getHauptform(syn_block, word, default_return_first_Syn=False):
 
 
 
-stop_words=list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)
+#############  return strings
 
+mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
+emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
+urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
+
+def replaceEmails(replace_with="EMAIL") -> str:
+    ret = lambda tok : emailFinder.sub(replace_with, tok.lower_)
+
+    ret.__annotations__ = get_calling_function().__annotations__
+    return ret
+
+def replaceURLs(replace_with="URL") -> str:
+    ret = lambda tok: textacy.preprocess.replace_urls(tok.lower_,replace_with=replace_with)
+    #ret = lambda tok: urlFinder.sub(replace_with,tok.lower_)
+
+    ret.__annotations__ = get_calling_function().__annotations__
+    return ret
+
+def replaceTwitterMentions(replace_with="TWITTER_MENTION") -> str:
+    ret = lambda tok : mentionFinder.sub(replace_with,tok.lower_)
+
+    ret.__annotations__ = get_calling_function().__annotations__
+    return ret
+
+def replaceNumbers(replace_with="NUMBER") -> str:
+    ret = lambda tok: textacy.preprocess.replace_numbers(tok.lower_, replace_with=replace_with)
+
+    ret.__annotations__ = get_calling_function().__annotations__
+    return ret
+
+def replacePhonenumbers(replace_with="PHONENUMBER") -> str:
+    ret = lambda tok: textacy.preprocess.replace_phone_numbers(tok.lower_, replace_with=replace_with)
+
+    ret.__annotations__ = get_calling_function().__annotations__
+    return ret
+
+
+def resolveAbbreviations():
+    pass #todo
+
+
+metaliste = [
+    "Subject",
+    "categoryName",
+    "Solution"
+]
+path2csv = "M42-Export/Tickets_small.csv"
+
+
+
+clean_in_meta = {
+    "Solution":[removePOS(["SPACE"])],
+    "Subject":[removePOS(["SPACE","PUNCT"])]
+}
 
 
 clean_in_content=[
-    removePOS(["SPACE"]),
-    removeWords(["dezernat"]),
-    removePOS(["PUNCT"]),
+    removePOS(["SPACE","PUNCT","NUM"]),
+    keepPOS(["NOUN"]),
     replaceURLs(),
-    removePOS(["NUM"]),
-    lemmatize(),
-    removeWords(stop_words),
-    keepUniqueTokens(),
-    normalizeSynonyms()
+    replaceEmails(),
+    removeWords(de_stop_words),
+    lemmatize()
+
 ]
 
 
@@ -272,7 +425,8 @@ clean_in_content=[
 ## add files to textacy-corpus,
 print("add texts to textacy-corpus...")
 corpus.add_texts(
-    processTextstream(generateMainTextfromTicketXML(path2xml), clean_in_content),
+    processTextstream(csv_to_contentStream(path2csv,"Description"), clean_in_content),
+    processDictstream(csv_to_metaStream(path2csv,metaliste),clean_in_meta)
 )
 
 printRandomDoc(corpus)
@@ -287,3 +441,5 @@ printRandomDoc(corpus)
 
 
 
+end = time.time()
+print("\n\n\nTime Elapsed:{0}".format(end - start))
\ No newline at end of file