# -*- coding: utf-8 -*-
import os
import time
from datetime import datetime
import textacy
import csv
import re
import sys
import configparser

from miscellaneous import *

csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# load config
config_ini = FILEPATH + "config.ini"
config = configparser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
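
# Sketch of the expected config.ini (section and option names are taken from the
# config.get() calls in this file; the values shown are assumptions and will
# differ per deployment):
#
#   [de_corpus]
#   path = corpi/
#
#   [nouns]
#   pickle_file = nouns_list.pkl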

############# string cleaning
def clean(stringstream):  # , NOUNS):
    """
    fix bad unicode,
    separate words on the regex chars ` \= ~ % ^ & * ( ) _ + \[ \] { } ; \' " | < / >,
    normalize whitespace,
    remove line breaks,
    replace rock dots (umlauts)

    :param stringstream: str-gen
    :return: str-gen
    """

    # NOUNS = [n.lower() for n in NOUNS]

    for string in stringstream:
        # fix unicode
        string = textacy.preprocess.fix_bad_unicode(string)
        # string = textacy.preprocess.unidecode(string)

        # separate words on regex:
        string = " ".join(re.compile(r'[`\=~%^&*()_+\[\]{};\'"|</>]').split(string))
        # TODO: maybe keep this step? For headers and footers, consider English
        # language detection, address parsing and salutation detection.

        # normalize whitespace
        string = textacy.preprocess.normalize_whitespace(string)
        # TODO: consider textacy.preprocess.remove_accents(text, method=u'unicode')

        # remove line breaks
        string = re.sub(r'[\n]', " ", string)  # TODO: possibly drop this, since paragraph breaks may carry meaning

        string = replaceRockDots(string)  # TODO: belongs with the normalize step

        """
        # Fixing wrong capitalization via a noun list does not work properly,
        # since words inside a sentence get changed as well.

        # for n in nouns:
        #     string = string.replace(n.lower(), n)
        # string = multisub(nouns_tuples, string)

        # https://stackoverflow.com/questions/10968558/python-re-sub-with-a-list-of-words-to-find
        # string = re.sub(r'[\n]', " ", string)
        # string = string.replace(noun, noun.title()) for noun in nouns

        splitted = string.split()
        for i, s in enumerate(splitted):
            if s in NOUNS:
                splitted[i] = s.title()
            if i != 0:
                for punct in ":.!?":
                    if punct in splitted[i - 1]:
                        splitted[i] = s.title()

        string = " ".join(splitted)
        """

        yield string
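
# Illustrative usage of clean() (a sketch: the exact output depends on
# replaceRockDots from miscellaneous, which is assumed to map umlauts to
# plain vowels):
#
#   for s in clean(["Grüße,\nbitte   `Ticket`   prüfen!"]):
#       print(s)  # e.g. "Grusse, bitte Ticket prufen!"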


def processDictstream_v2(dictstream, keys_to_clean):
    # normalize, lowercase and strip sentence punctuation from the values of the given keys
    for dic in dictstream:
        result = {k: re.sub(r'[.!?]', "", normalize_str(v).lower()) if k in keys_to_clean else v
                  for k, v in dic.items()}
        yield result
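
# Illustrative usage (sketch; assumes normalize_str from miscellaneous
# collapses whitespace):
#
#   metas = [{"Subject": "Drucker  kaputt!", "TicketNumber": "INC42"}]
#   for meta in processDictstream_v2(metas, ["Subject"]):
#       print(meta)  # e.g. {"Subject": "drucker kaputt", "TicketNumber": "INC42"}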

def processDictstream(dictstream, funcdict, parser):
    """
    :param dictstream: dict-gen
    :param funcdict:
        clean_in_meta = {
            "Solution": funclist,
            ...
        }
    :param parser: spacy-parser
    :return: dict-gen
    """
    for dic in dictstream:
        result = {}
        for key, value in dic.items():
            if key in funcdict:
                doc = parser(value)
                tokens = [tok for tok in doc]
                funclist = funcdict[key]
                tokens = filterTokens(tokens, funclist)
                result[key] = " ".join([tok.lower_ for tok in tokens])
            else:
                result[key] = value
        yield result
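
# Illustrative funcdict for processDictstream (a sketch mirroring the
# commented-out variant in main() below):
#
#   clean_in_meta = {"Subject": [removePOS(["SPACE", "PUNCT"])]}
#   cleaned_metas = processDictstream(corpus2Meta(corpus), clean_in_meta, parser=parser)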

def filterTokens(tokens, funclist):
    # in: token list, funclist
    # out: token list
    for f in funclist:
        tokens = list(filter(f, tokens))
    return tokens
##################################################################################################
corpus_de_path = FILEPATH + config.get("de_corpus", "path")

def cleanCorpus(corpus, clean_in_meta):
    logprint("Clean {0}_corpus at {1}".format(corpus.lang, datetime.now()))

    """
    ressources_path = FILEPATH + "ressources/"
    path2nouns_list = ressources_path + config.get("nouns", "pickle_file")

    # NOUNS = load_obj(path2nouns_list)
    # noun_disjunction = '|'.join(NOUNS)
    # nouns_tuples = []
    # for n in NOUNS:
    #     nouns_tuples.append((n.lower(), n))
    """

    # load corpus
    raw_corpus = corpus
    parser = corpus.spacy_lang

    # actually clean the corpus
    cleaned_corpus = textacy.Corpus(parser)
    cleaned_corpus.add_texts(
        clean(corpus2Text(raw_corpus)),
        # processDictstream(corpus2Meta(raw_corpus), clean_in_meta, parser=parser)
        processDictstream_v2(corpus2Meta(raw_corpus), clean_in_meta)
    )

    # kick empty docs out of the corpus
    cleaned_corpus.remove(lambda doc: len(doc) == 0)

    # save corpus
    cleanCorpus_name = corpus.lang + "_clean"
    save_corpus(corpus=cleaned_corpus, corpus_path=corpus_de_path, corpus_name=cleanCorpus_name)

    return cleaned_corpus


def removePOS(pos_list):
    return lambda tok: tok.pos_ not in pos_list
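
# Illustrative usage of removePOS (a sketch; parser is assumed to be a loaded
# spaCy model): the returned predicate keeps tokens whose POS tag is not in
# pos_list, so it composes with filterTokens:
#
#   keep_words = removePOS(["SPACE", "PUNCT"])
#   doc = parser("Hallo , Welt !")
#   tokens = filterTokens([tok for tok in doc], [keep_words])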


def main(corpus):
    start = time.time()

    # funcdict variant, for use with processDictstream:
    # clean_in_meta = {
    #     "Solution": [removePOS(["SPACE"])],
    #     "Subject": [removePOS(["SPACE", "PUNCT"])],
    #     "categoryName": [removePOS(["SPACE", "PUNCT"])]
    # }

    # key list for processDictstream_v2:
    clean_in_meta = ["Subject", "categoryName"]

    cleaned_corpus = cleanCorpus(corpus, clean_in_meta)

    end = time.time()
    logprint("Time Elapsed Cleaning: {0} min".format((end - start) / 60))

    return cleaned_corpus


if __name__ == "__main__":
    corpus, parser = load_corpus(corpus_path="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/",
                                 corpus_name="de_raw")
    main(corpus)