# -*- coding: utf-8 -*-

import time

start = time.time()

# NOTE: most of these imports are only exercised by the disabled experiment
# blocks further down; the live code needs little more than time, re,
# textacy and postal.
import corenlp
import os
import re

import spacy
import textacy
import nltk
from textblob_de import TextBlobDE
from textblob_de import PatternParser
#from polyglot.text import Text
import hunspell
from postal.parser import parse_address


# quick smoke test: run libpostal's address parser over a sample file
print(parse_address(str(textacy.fileio.read_file("teststring.txt"))))
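# parse_address returns a list of (token, label) tuples; labels such as
# 'road', 'house_number' and 'city' come from libpostal. The line below is
# only an illustration, not the actual content of teststring.txt:
#   [('otto-hahn-str.', 'road'), ('14', 'house_number'), ('dortmund', 'city')]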


"""
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'


def stringcleaning(stringstream, funclist):
    # lazily pipe every string through the given cleaning functions, in order
    for string in stringstream:
        for f in funclist:
            string = f(string)
        yield string


def separate_words_on_regex(regex=regex_specialChars):
    # build a cleaning function that splits on the regex and rejoins with spaces
    return lambda string: " ".join(re.compile(regex).split(string))


words = [
    "uniaccount",
    "nr54065467",
    "nr54065467",
    "455a33c5",
    "tvt?=",
    "tanja.saborowski@tu-dortmund.de",
    "-",
    "m-sw1-vl4053.itmc.tu-dortmund.de",
    "------problem--------"
]


topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)


for s in stringcleaning((w for w in words), [separate_words_on_regex()]):
    print(s.strip())

#print(stringcleaning(w,string_comp))
#print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
#print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
#result = specialFinder.sub(" ", w)
#print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))

#print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
"""


"""
# union of spaCy's built-in German stop words and a list loaded from disk
de_stop_words = set(
    list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) +
    list(textacy.fileio.read_file_lines("stopwords-de.txt"))
)


LEMMAS = list(textacy.fileio.read_file_lines(filepath="lemmatization-de.txt"))

VORNAMEN = list(textacy.fileio.read_file_lines("vornamen.txt"))


#blob = Text(str(textacy.fileio.read_file("teststring.txt")))#,parser=PatternParser(pprint=True, lemmata=True))
#print(blob.entities)


# fold umlauts/eszett first, then normalize whitespace
de_stop_words = list(map(replaceRockDots(), de_stop_words))
LEMMAS = list(map(replaceRockDots(), LEMMAS))
VORNAMEN = list(map(replaceRockDots(), VORNAMEN))

de_stop_words = list(map(textacy.preprocess.normalize_whitespace, de_stop_words))
LEMMAS = list(map(textacy.preprocess.normalize_whitespace, LEMMAS))
VORNAMEN = list(map(textacy.preprocess.normalize_whitespace, VORNAMEN))


# persist the cleaned lists
textacy.fileio.write_file_lines(LEMMAS, "lemmas.txt")
textacy.fileio.write_file_lines(VORNAMEN, "firstnames.txt")
textacy.fileio.write_file_lines(de_stop_words, "de_stop_words.txt")
"""


end = time.time()
print("\n\n\nTime elapsed: {0} s\n\n".format(end - start))