preprocessingpipe verfeinert

2017-09-11 17:29:54 +02:00 · 2017-09-11 17:29:54 +02:00 · e6548225e3
parent 8b96076337
commit e6548225e3
1 changed files with 122 additions and 0 deletions
--- a/test.py
+++ b/test.py
@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+import functools
+import re
+import xml.etree.ElementTree as ET
+
+import spacy
+import textacy
+
+# Path of the ticket XML file that is read when the corpus is built below.
+path2xml = "ticketSamples.xml"
+import de_core_news_md
+
+
+# Load the de_core_news_md model once at import time; it backs the corpus
+# below and is the default parser of processTextstream().
+PARSER = de_core_news_md.load()
+# textacy corpus that receives the preprocessed ticket texts.
+corpus = textacy.Corpus(PARSER)
+
+
+
+def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung'):
+    """
+    Yield the text of one field per ticket from a ticket XML file.
+
+    Tickets are read as the direct children of the root element and fields
+    as the direct children of each ticket.
+
+    :param path2xml: path of the XML file to parse
+    :param main_textfield: tag name of the field whose text is yielded
+    :yields: one string per matching field; an empty field yields "" so that
+        consumers expecting strings never receive None
+    """
+    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
+
+    for ticket in tree.getroot():
+        for field in ticket:
+            if field.tag == main_textfield:
+                # Element.text is None for an element without text content.
+                yield field.text or ""
+
+
+def printRandomDoc(textacyCorpus):
+    """
+    Print the size of the corpus and one randomly chosen document.
+
+    :param textacyCorpus: sized, indexable collection whose items expose
+        ``text`` and ``metadata``
+    """
+    import random
+    print()
+
+    print("len(textacyCorpus) = %i" % len(textacyCorpus))
+    # An empty corpus has no document to show, so only the size is printed.
+    if len(textacyCorpus) > 0:
+        # randrange covers every valid index, including the last one.
+        randIndex = random.randrange(len(textacyCorpus))
+        print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
+
+    print()
+
+
+
+def processTextstream(textstream, funclist, parser=PARSER):
+    """
+    Lazily turn a stream of raw texts into a stream of cleaned strings.
+
+    :param textstream: iterable of strings handed to ``parser.pipe``
+    :param funclist: functions applied in order to every token; a token is
+        kept only if each of them returns a truthy value
+    :param parser: object providing ``pipe``; defaults to the module parser
+    :yields: per document, the lower-cased kept tokens joined by spaces
+    """
+    for doc in parser.pipe(textstream):
+        # all() short-circuits, so later functions only see tokens that the
+        # earlier ones accepted.
+        kept = [tok.lower_ for tok in doc if all(f(tok) for f in funclist)]
+        yield " ".join(kept)
+
+
+
+
+def keepPOS(pos_list):
+    """
+    Build a token predicate that accepts only the given part-of-speech tags.
+
+    :param pos_list: container of tags compared against ``tok.pos_``
+    :return: function mapping a token to True if its ``pos_`` is in pos_list
+    """
+    def has_listed_pos(tok):
+        return tok.pos_ in pos_list
+
+    return has_listed_pos
+
+def removePOS(pos_list):
+    """
+    Build a token predicate that rejects the given part-of-speech tags.
+
+    :param pos_list: container of tags compared against ``tok.pos_``
+    :return: function mapping a token to True if its ``pos_`` is not in
+        pos_list
+    """
+    def lacks_listed_pos(tok):
+        return tok.pos_ not in pos_list
+
+    return lacks_listed_pos
+
+def removeWords(words, keep=None):
+    """
+    Build a token predicate that rejects tokens whose lower-cased text is one
+    of the given words.
+
+    :param words: a single word or an iterable of words to remove; the
+        argument itself is never modified
+    :param keep: optional single word or iterable of words that must not be
+        removed even if they occur in ``words``
+    :return: function mapping a token to True if the token should be kept
+    """
+    def as_set(value):
+        # A bare string is one word, not an iterable of characters.
+        if isinstance(value, str):
+            return {value}
+        if hasattr(value, '__iter__'):
+            return set(value)
+        return set()
+
+    # Work on a private set: the caller's collection stays untouched and the
+    # per-token membership test does not scan a list.
+    blocked = as_set(words) - as_set(keep)
+    return lambda tok: tok.lower_ not in blocked
+
+# Case-insensitive pattern for e-mail addresses delimited by word boundaries.
+emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
+
+
+def replaceEmails(replace_with="EMAIL"):
+    """
+    Build a function that masks e-mail addresses in a token's text.
+
+    :param replace_with: string substituted for every matched address
+    :return: function mapping a token to its lower-cased text with all
+        matches of ``emailFinder`` replaced
+    """
+    def mask_emails(tok):
+        return emailFinder.sub(replace_with, tok.lower_)
+
+    return mask_emails
+
+
+# Stop words of the parser's language, read from the STOP_WORDS attribute of
+# the dynamically imported "spacy.<lang>" module.
+stop_words=list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)
+
+# Functions handed to processTextstream() further down, which keeps a token
+# only when every function returns a truthy value for it.
+clean_in_content=[
+    removePOS(["SPACE"]),
+    removePOS(["PUNCT"]),
+    removeWords(stop_words,keep=["und"]),
+    # NOTE(review): replaceEmails is passed uncalled, so it is invoked with the
+    # token as replace_with and returns a function, which is always truthy; this
+    # entry neither drops nor rewrites any token - confirm the intended use.
+    replaceEmails
+]
+
+
+
+# Build the corpus: read the ticket texts from the XML file, clean them with
+# the functions in clean_in_content and add the results to the textacy corpus.
+print("add texts to textacy-corpus...")
+corpus.add_texts(
+    processTextstream(generateMainTextfromTicketXML(path2xml), clean_in_content),
+)
+
+# Print one randomly chosen document of the filled corpus.
+printRandomDoc(corpus)
+
+#todo https://stackoverflow.com/questions/15200048/how-to-get-the-parameters-type-and-return-type-of-a-function
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+