# topicModelingTickets/testra.py

# -*- coding: utf-8 -*-
import time
# Start the wall-clock timer before the remaining imports so that the elapsed
# time printed at the end of the script also covers module import time.
start = time.time()
import corenlp as corenlp
import os
import re

import spacy
import textacy
import nltk
from textblob_de import TextBlobDE
from textblob_de import PatternParser
#from polyglot.text import Text
import hunspell
from postal.parser import parse_address


# Load teststring.txt through textacy, coerce the result to str, and print
# whatever postal's parse_address returns for that text.
sample_text = str(textacy.fileio.read_file("teststring.txt"))
print(parse_address(sample_text))


"""
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'


def stringcleaning(stringstream, funclist):
    for string in stringstream:
        for f in funclist:
            string = f(string)
        yield string


def seperate_words_on_regex(regex=regex_specialChars):
    return lambda string: " ".join(re.compile(regex).split(string))


words = [
    "uniaccount",
    "nr54065467",
    "nr54065467",
    "455a33c5,"
    "tvt?=",
    "tanja.saborowski@tu-dortmund.de",
    "-",
    "m-sw1-vl4053.itmc.tu-dortmund.de",
    "------problem--------"
]


topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)

for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):
    print(s.strip())

    #print(stringcleaning(w,string_comp))
    #print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
    #print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
    #result = specialFinder.sub(" ", w)
    #print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))

    #print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
"""


"""
de_stop_words= set(
    list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) +
    list(textacy.fileio.read_file_lines("stopwords-de.txt"))
)


LEMMAS = list(textacy.fileio.read_file_lines(filepath="lemmatization-de.txt"))

VORNAMEN = list(textacy.fileio.read_file_lines("vornamen.txt"))


#blob = Text(str(textacy.fileio.read_file("teststring.txt")))#,parser=PatternParser(pprint=True, lemmata=True))

#print(blob.entities)

de_stop_words = list(map(replaceRockDots(),de_stop_words))
LEMMAS = list(map(replaceRockDots(),LEMMAS))
VORNAMEN = list(map(replaceRockDots(),VORNAMEN))

de_stop_words = list(map(textacy.preprocess.normalize_whitespace,de_stop_words))
LEMMAS = list(map(textacy.preprocess.normalize_whitespace,LEMMAS))
VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,VORNAMEN))


textacy.fileio.write_file_lines(LEMMAS,"lemmas.txt")
textacy.fileio.write_file_lines(VORNAMEN,"firstnames.txt")
textacy.fileio.write_file_lines(de_stop_words,"de_stop_words.txt")
"""

# Stop the wall-clock timer and report the seconds elapsed since `start`
# was recorded at the top of the script.
end = time.time()
elapsed_seconds = end - start
print("\n\n\nTime Elapsed Topic:{0}\n\n".format(elapsed_seconds))