+
+
+
+
+
+
\ No newline at end of file
diff --git a/java_LabledLDA/LICENSE b/java_LabledLDA/LICENSE
new file mode 100644
index 0000000..d159169
--- /dev/null
+++ b/java_LabledLDA/LICENSE
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/java_LabledLDA/LabledLDA.iml b/java_LabledLDA/LabledLDA.iml
new file mode 100644
index 0000000..39f2db5
--- /dev/null
+++ b/java_LabledLDA/LabledLDA.iml
@@ -0,0 +1,13 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/java_LabledLDA/README.md b/java_LabledLDA/README.md
new file mode 100644
index 0000000..a5d1262
--- /dev/null
+++ b/java_LabledLDA/README.md
@@ -0,0 +1,109 @@
+Labeled LDA in Java (based on JGibbLDA)
+=======================================
+
+This is a Java implementation of Labeled LDA based on the popular
+[JGibbLDA](http://jgibblda.sourceforge.net/) package. The code has been heavily
+refactored and a few additional options have been added. See sections below for
+more details.
+
+Data Format
+-----------
+
+The input data format is similar to the [JGibbLDA input data
+format](http://jgibblda.sourceforge.net/#_2.3._Input_Data_Format), with some
+minor cosmetic changes and additional support for document labels necessary for
+Labeled LDA. We first describe the (modified) input format for unlabeled
+documents, followed by the (new) input format for labeled documents.
+
+**Changed from JGibbLDA**: All input/output files must be Gzipped.
+
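+If your corpus is currently a plain-text file, a small helper along these lines
+can produce the required Gzipped input. This is a minimal sketch using only the
+standard `java.util.zip` package; the `GzipCorpus` class name and the file names
+are illustrative, not part of this package:
+
+    import java.io.*;
+    import java.util.zip.GZIPOutputStream;
+
+    public class GzipCorpus {
+        public static void main(String[] args) throws IOException {
+            // copy the raw corpus bytes through a gzip stream
+            try (InputStream in = new FileInputStream("trndocs.dat");
+                 OutputStream out = new GZIPOutputStream(
+                         new FileOutputStream("trndocs.dat.gz"))) {
+                byte[] buf = new byte[8192];
+                int n;
+                while ((n = in.read(buf)) != -1) {
+                    out.write(buf, 0, n);
+                }
+            }
+        }
+    }
+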
+### Unlabeled Documents
+
+Unlabeled documents have the following format:
+
+ document_1
+ document_2
+ ...
+ document_m
+
+where each document is a space-separated list of terms, i.e.:
+
+ document_i = term_1 term_2 ... term_n
+
+**Changed from JGibbLDA**: The first line *should not* be an integer indicating
+the number of documents in the file. The original JGibbLDA code has been
+modified to identify the number of documents automatically.
+
+**Note**: Labeled and unlabeled documents may be mixed in the input file, so
+you must ensure that unlabeled documents do not begin with a left square bracket
+(see the Labeled Documents input format below). One easy fix is to prepend a
+space character (' ') to each unlabeled document line.
+
+### Labeled Documents
+
+Labeled documents follow a format similar to unlabeled documents, but with the
+labels given at the beginning of each line and surrounded by square brackets,
+e.g.:
+
+ [label_1,1 label_1,2 ... label_1,l_1] document_1
+ [label_2,1 label_2,2 ... label_2,l_2] document_2
+ ...
+ [label_m,1 label_m,2 ... label_m,l_m] document_m
+
+where each label is an integer in the range [0, K-1], for K equal to the number
+of topics (-ntopics).
+
+**Note**: Labeled and unlabeled documents may be mixed in the input file. An
+unlabeled document is equivalent to labeling a document with every label in the
+range [0, K-1].
+
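+For example, a hypothetical three-document input file (the words are purely
+illustrative) mixing labeled and unlabeled documents might look like:
+
+    [0 2] printer jammed paper tray error
+    [1] password reset login account
+     network slow vpn connection drops
+
+Here the first document carries labels 0 and 2, the second carries label 1, and
+the third is unlabeled (note the leading space, so it is not mistaken for a
+labeled document).
+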
+Usage
+-----
+
+Please see the [JGibbLDA usage](http://jgibblda.sourceforge.net/#_2.2._Command_Line_&_Input_Parameter), noting the following changes:
+
+* All input files must be Gzipped. All output files are also Gzipped.
+
+* New options have been added:
+
+    **-nburnin <int>**: Discard this many initial iterations when taking samples.
+
+    **-samplinglag <int>**: The number of iterations between samples.
+
+ **-infseparately**: Inference is done separately for each document, as if
+ inference for each document was performed in isolation.
+
+ **-unlabeled**: Ignore document labels, i.e., treat every document as
+ unlabeled.
+
+* Some options have been deleted:
+
+ **-wordmap**: Filename is automatically built based on model path.
+
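+Putting the options together, a hypothetical training run might look like the
+following (the classpath is an assumption: the compiled classes plus the args4j
+and Trove jars must be on it, and the directory, data file, and model names are
+placeholders):
+
+    java -cp classes:lib/args4j.jar:lib/trove.jar jgibblda.LDA -est \
+        -dir models/tickets -dfile tickets.gz -model tickets \
+        -ntopics 50 -niters 1000 -nburnin 500 -samplinglag 5 -twords 20
+
+Inference on a new (Gzipped) data file then uses `-inf` together with the same
+`-dir` and `-model` arguments and the new `-dfile`.
+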
+Contact
+-------
+
+Please direct questions to [Myle Ott](myleott@gmail.com).
+
+License
+-------
+
+Following JGibbLDA, this code is licensed under the GPLv2. Please see the
+LICENSE file for the full license.
+
+Labeled LDA in Java
+Copyright (C) 2008-2013 Myle Ott (Labeled LDA), Xuan-Hieu Phan and Cam-Tu Nguyen (JGibbLDA)
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
diff --git a/java_LabledLDA/models/tickets/.others.gz b/java_LabledLDA/models/tickets/.others.gz
new file mode 100644
index 0000000..bd57ad4
Binary files /dev/null and b/java_LabledLDA/models/tickets/.others.gz differ
diff --git a/java_LabledLDA/models/tickets/.tassign.gz b/java_LabledLDA/models/tickets/.tassign.gz
new file mode 100644
index 0000000..07bf22c
Binary files /dev/null and b/java_LabledLDA/models/tickets/.tassign.gz differ
diff --git a/java_LabledLDA/models/tickets/.theta.gz b/java_LabledLDA/models/tickets/.theta.gz
new file mode 100644
index 0000000..8dc11ae
Binary files /dev/null and b/java_LabledLDA/models/tickets/.theta.gz differ
diff --git a/java_LabledLDA/models/tickets/.twords.gz b/java_LabledLDA/models/tickets/.twords.gz
new file mode 100644
index 0000000..608a547
Binary files /dev/null and b/java_LabledLDA/models/tickets/.twords.gz differ
diff --git a/java_LabledLDA/models/tickets/.wordmap.gz b/java_LabledLDA/models/tickets/.wordmap.gz
new file mode 100644
index 0000000..4df13f8
Binary files /dev/null and b/java_LabledLDA/models/tickets/.wordmap.gz differ
diff --git a/java_LabledLDA/models/tickets/tickets.gz b/java_LabledLDA/models/tickets/tickets.gz
new file mode 100644
index 0000000..8d3fe3a
Binary files /dev/null and b/java_LabledLDA/models/tickets/tickets.gz differ
diff --git a/java_LabledLDA/out/production/LabledLDA/jgibblda/Dictionary.class b/java_LabledLDA/out/production/LabledLDA/jgibblda/Dictionary.class
new file mode 100644
index 0000000..ba24d8f
Binary files /dev/null and b/java_LabledLDA/out/production/LabledLDA/jgibblda/Dictionary.class differ
diff --git a/java_LabledLDA/out/production/LabledLDA/jgibblda/Document.class b/java_LabledLDA/out/production/LabledLDA/jgibblda/Document.class
new file mode 100644
index 0000000..defecd7
Binary files /dev/null and b/java_LabledLDA/out/production/LabledLDA/jgibblda/Document.class differ
diff --git a/java_LabledLDA/out/production/LabledLDA/jgibblda/Estimator.class b/java_LabledLDA/out/production/LabledLDA/jgibblda/Estimator.class
new file mode 100644
index 0000000..efa7ee8
Binary files /dev/null and b/java_LabledLDA/out/production/LabledLDA/jgibblda/Estimator.class differ
diff --git a/java_LabledLDA/out/production/LabledLDA/jgibblda/Inferencer.class b/java_LabledLDA/out/production/LabledLDA/jgibblda/Inferencer.class
new file mode 100644
index 0000000..9ec8304
Binary files /dev/null and b/java_LabledLDA/out/production/LabledLDA/jgibblda/Inferencer.class differ
diff --git a/java_LabledLDA/out/production/LabledLDA/jgibblda/LDA.class b/java_LabledLDA/out/production/LabledLDA/jgibblda/LDA.class
new file mode 100644
index 0000000..3e6ed60
Binary files /dev/null and b/java_LabledLDA/out/production/LabledLDA/jgibblda/LDA.class differ
diff --git a/java_LabledLDA/out/production/LabledLDA/jgibblda/LDACmdOption.class b/java_LabledLDA/out/production/LabledLDA/jgibblda/LDACmdOption.class
new file mode 100644
index 0000000..b0f7878
Binary files /dev/null and b/java_LabledLDA/out/production/LabledLDA/jgibblda/LDACmdOption.class differ
diff --git a/java_LabledLDA/out/production/LabledLDA/jgibblda/LDADataset.class b/java_LabledLDA/out/production/LabledLDA/jgibblda/LDADataset.class
new file mode 100644
index 0000000..f9dd6cf
Binary files /dev/null and b/java_LabledLDA/out/production/LabledLDA/jgibblda/LDADataset.class differ
diff --git a/java_LabledLDA/out/production/LabledLDA/jgibblda/Model.class b/java_LabledLDA/out/production/LabledLDA/jgibblda/Model.class
new file mode 100644
index 0000000..1d97a68
Binary files /dev/null and b/java_LabledLDA/out/production/LabledLDA/jgibblda/Model.class differ
diff --git a/java_LabledLDA/out/production/LabledLDA/jgibblda/Pair.class b/java_LabledLDA/out/production/LabledLDA/jgibblda/Pair.class
new file mode 100644
index 0000000..df577a4
Binary files /dev/null and b/java_LabledLDA/out/production/LabledLDA/jgibblda/Pair.class differ
diff --git a/java_LabledLDA/src/jgibblda/Dictionary.java b/java_LabledLDA/src/jgibblda/Dictionary.java
new file mode 100644
index 0000000..cbff86b
--- /dev/null
+++ b/java_LabledLDA/src/jgibblda/Dictionary.java
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2007 by
+ *
+ * Xuan-Hieu Phan
+ * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
+ * Graduate School of Information Sciences
+ * Tohoku University
+ *
+ * Cam-Tu Nguyen
+ * ncamtu@gmail.com
+ * College of Technology
+ * Vietnam National University, Hanoi
+ *
+ * JGibbsLDA is a free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * JGibbsLDA is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with JGibbsLDA; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+package jgibblda;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+
+import gnu.trove.map.hash.TObjectIntHashMap;
+import gnu.trove.map.hash.TIntObjectHashMap;
+
+public class Dictionary {
+    public TObjectIntHashMap<String> word2id;
+    public TIntObjectHashMap<String> id2word;
+
+ //--------------------------------------------------
+ // constructors
+ //--------------------------------------------------
+
+ public Dictionary(){
+        word2id = new TObjectIntHashMap<String>();
+        id2word = new TIntObjectHashMap<String>();
+ }
+
+ //---------------------------------------------------
+ // get/set methods
+ //---------------------------------------------------
+
+ public String getWord(int id){
+ return id2word.get(id);
+ }
+
+ public int getID(String word){
+ return word2id.get(word);
+ }
+
+ //----------------------------------------------------
+ // checking methods
+ //----------------------------------------------------
+ /**
+ * check if this dictionary contains a specified word
+ */
+ public boolean contains(String word){
+ return word2id.containsKey(word);
+ }
+
+ public boolean contains(int id){
+ return id2word.containsKey(id);
+ }
+ //---------------------------------------------------
+    // manipulating methods
+ //---------------------------------------------------
+ /**
+ * add a word into this dictionary
+ * return the corresponding id
+ */
+ public int addWord(String word){
+ if (!contains(word)){
+ int id = word2id.size();
+
+ word2id.put(word, id);
+ id2word.put(id,word);
+
+ return id;
+ }
+ else return getID(word);
+ }
+
+ //---------------------------------------------------
+ // I/O methods
+ //---------------------------------------------------
+ /**
+ * read dictionary from file
+ */
+ public boolean readWordMap(String wordMapFile)
+ {
+ try {
+ BufferedReader reader = new BufferedReader(new InputStreamReader(
+ new GZIPInputStream(
+ new FileInputStream(wordMapFile)), "UTF-8"));
+ String line;
+
+ for (int i = 0; (line = reader.readLine()) != null; i++) {
+ String word = line.trim();
+ id2word.put(i, word);
+ word2id.put(word, i);
+ }
+
+ reader.close();
+ return true;
+ }
+ catch (Exception e) {
+ System.out.println("Error while reading dictionary:" + e.getMessage());
+ e.printStackTrace();
+ return false;
+ }
+ }
+
+ public boolean writeWordMap(String wordMapFile)
+ {
+ try {
+ BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
+ new GZIPOutputStream(
+ new FileOutputStream(wordMapFile)), "UTF-8"));
+
+ //write word to id
+ for (int i = 0; i < id2word.size(); i++) {
+ writer.write(id2word.get(i) + "\n");
+ }
+
+ writer.close();
+ return true;
+ }
+ catch (Exception e) {
+ System.out.println("Error while writing word map " + e.getMessage());
+ e.printStackTrace();
+ return false;
+ }
+ }
+}
diff --git a/java_LabledLDA/src/jgibblda/Document.java b/java_LabledLDA/src/jgibblda/Document.java
new file mode 100644
index 0000000..2bf8ceb
--- /dev/null
+++ b/java_LabledLDA/src/jgibblda/Document.java
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2007 by
+ *
+ * Xuan-Hieu Phan
+ * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
+ * Graduate School of Information Sciences
+ * Tohoku University
+ *
+ * Cam-Tu Nguyen
+ * ncamtu@gmail.com
+ * College of Technology
+ * Vietnam National University, Hanoi
+ *
+ * JGibbsLDA is a free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * JGibbsLDA is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with JGibbsLDA; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+package jgibblda;
+
+import gnu.trove.list.array.TIntArrayList;
+
+public class Document {
+
+ //----------------------------------------------------
+ //Instance Variables
+ //----------------------------------------------------
+ public int[] words;
+ public String rawStr = "";
+ public int length;
+ public int[] labels = null;
+
+ public Document(TIntArrayList doc){
+ this.length = doc.size();
+ this.words = new int[length];
+ for (int i = 0; i < length; i++){
+ this.words[i] = doc.get(i);
+ }
+ }
+
+ public Document(TIntArrayList doc, String rawStr)
+ {
+ this(doc);
+ this.rawStr = rawStr;
+ }
+
+ public Document(TIntArrayList doc, String rawStr, TIntArrayList tlabels)
+ {
+ this(doc, rawStr);
+ this.labels = tlabels != null ? tlabels.toArray() : null;
+ }
+}
diff --git a/java_LabledLDA/src/jgibblda/Estimator.java b/java_LabledLDA/src/jgibblda/Estimator.java
new file mode 100644
index 0000000..c7cd757
--- /dev/null
+++ b/java_LabledLDA/src/jgibblda/Estimator.java
@@ -0,0 +1,145 @@
+/*
+ * Copyright (C) 2007 by
+ *
+ * Xuan-Hieu Phan
+ * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
+ * Graduate School of Information Sciences
+ * Tohoku University
+ *
+ * Cam-Tu Nguyen
+ * ncamtu@gmail.com
+ * College of Technology
+ * Vietnam National University, Hanoi
+ *
+ * JGibbsLDA is a free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * JGibbsLDA is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with JGibbsLDA; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+package jgibblda;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+
+public class Estimator
+{
+ // output model
+ protected Model trnModel;
+ LDACmdOption option;
+
+ public Estimator(LDACmdOption option) throws FileNotFoundException, IOException
+ {
+ this.option = option;
+
+ trnModel = new Model(option);
+
+ if (option.est){
+ trnModel.init(true);
+ }
+ else if (option.estc){
+ trnModel.init(false);
+ }
+ }
+
+ public void estimate()
+ {
+ System.out.println("Sampling " + trnModel.niters + " iterations!");
+ System.out.print("Iteration");
+ for (int startIter = ++trnModel.liter; trnModel.liter <= startIter - 1 + trnModel.niters; trnModel.liter++){
+ System.out.format("%6d", trnModel.liter);
+
+ // for all z_i
+ for (int m = 0; m < trnModel.M; m++){
+ for (int n = 0; n < trnModel.data.docs.get(m).length; n++){
+ // z_i = z[m][n]
+ // sample from p(z_i|z_-i, w)
+ int topic = sampling(m, n);
+ trnModel.z[m].set(n, topic);
+ }// end for each word
+ }// end for each document
+
+ if ((trnModel.liter == startIter - 1 + trnModel.niters) ||
+ (trnModel.liter > trnModel.nburnin && trnModel.liter % trnModel.samplingLag == 0)) {
+ trnModel.updateParams();
+ }
+
+ System.out.print("\b\b\b\b\b\b");
+ }// end iterations
+ trnModel.liter--;
+
+ System.out.println("\nSaving the final model!");
+ trnModel.saveModel();
+ }
+
+ /**
+ * Do sampling
+ * @param m document number
+ * @param n word number
+ * @return topic id
+ */
+ public int sampling(int m, int n)
+ {
+ // remove z_i from the count variable
+ int topic = trnModel.z[m].get(n);
+ int w = trnModel.data.docs.get(m).words[n];
+
+ trnModel.nw[w][topic] -= 1;
+ trnModel.nd[m][topic] -= 1;
+ trnModel.nwsum[topic] -= 1;
+ trnModel.ndsum[m] -= 1;
+
+ double Vbeta = trnModel.V * trnModel.beta;
+
+ // get labels for this document
+ int[] labels = trnModel.data.docs.get(m).labels;
+
+ // determine number of possible topics for this document
+ int K_m = (labels == null) ? trnModel.K : labels.length;
+
+        // do multinomial sampling via cumulative method
+ double[] p = trnModel.p;
+ for (int k = 0; k < K_m; k++) {
+ topic = labels == null ? k : labels[k];
+
+ p[k] = (trnModel.nd[m][topic] + trnModel.alpha) *
+ (trnModel.nw[w][topic] + trnModel.beta) /
+ (trnModel.nwsum[topic] + Vbeta);
+ }
+
+ // cumulate multinomial parameters
+ for (int k = 1; k < K_m; k++) {
+ p[k] += p[k - 1];
+ }
+
+ // scaled sample because of unnormalized p[]
+ double u = Math.random() * p[K_m - 1];
+
+ for (topic = 0; topic < K_m; topic++){
+ if (p[topic] > u) //sample topic w.r.t distribution p
+ break;
+ }
+
+ // map [0, K_m - 1] topic to [0, K - 1] topic according to labels
+ if (labels != null) {
+ topic = labels[topic];
+ }
+
+ // add newly estimated z_i to count variables
+ trnModel.nw[w][topic] += 1;
+ trnModel.nd[m][topic] += 1;
+ trnModel.nwsum[topic] += 1;
+ trnModel.ndsum[m] += 1;
+
+ return topic;
+ }
+}
diff --git a/java_LabledLDA/src/jgibblda/Inferencer.java b/java_LabledLDA/src/jgibblda/Inferencer.java
new file mode 100644
index 0000000..159557b
--- /dev/null
+++ b/java_LabledLDA/src/jgibblda/Inferencer.java
@@ -0,0 +1,178 @@
+/*
+ * Copyright (C) 2007 by
+ *
+ * Xuan-Hieu Phan
+ * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
+ * Graduate School of Information Sciences
+ * Tohoku University
+ *
+ * Cam-Tu Nguyen
+ * ncamtu@gmail.com
+ * College of Technology
+ * Vietnam National University, Hanoi
+ *
+ * JGibbsLDA is a free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * JGibbsLDA is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with JGibbsLDA; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+package jgibblda;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+
+public class Inferencer
+{
+ // Train model
+ public Model trnModel;
+ public Dictionary globalDict;
+ private LDACmdOption option;
+
+ private Model newModel;
+
+ //-----------------------------------------------------
+ // Init method
+ //-----------------------------------------------------
+ public Inferencer(LDACmdOption option) throws FileNotFoundException, IOException
+ {
+ this.option = option;
+
+ trnModel = new Model(option);
+ trnModel.init(false);
+
+ globalDict = trnModel.data.localDict;
+ }
+
+ //inference new model ~ getting data from a specified dataset
+ public Model inference() throws FileNotFoundException, IOException
+ {
+ newModel = new Model(option, trnModel);
+ newModel.init(true);
+ newModel.initInf();
+
+ System.out.println("Sampling " + newModel.niters + " iterations for inference!");
+ System.out.print("Iteration");
+ for (newModel.liter = 1; newModel.liter <= newModel.niters; newModel.liter++){
+ System.out.format("%6d", newModel.liter);
+
+ // for all newz_i
+ for (int m = 0; m < newModel.M; ++m){
+ for (int n = 0; n < newModel.data.docs.get(m).length; n++){
+ // sample from p(z_i|z_-1,w)
+ int topic = infSampling(m, n);
+ newModel.z[m].set(n, topic);
+ }
+ }//end foreach new doc
+
+ if ((newModel.liter == newModel.niters) ||
+ (newModel.liter > newModel.nburnin && newModel.liter % newModel.samplingLag == 0)) {
+ newModel.updateParams(trnModel);
+ }
+
+ System.out.print("\b\b\b\b\b\b");
+ }// end iterations
+ newModel.liter--;
+
+ System.out.println("\nSaving the inference outputs!");
+ String outputPrefix = newModel.dfile;
+ if (outputPrefix.endsWith(".gz")) {
+ outputPrefix = outputPrefix.substring(0, outputPrefix.length() - 3);
+ }
+ newModel.saveModel(outputPrefix + ".");
+
+ return newModel;
+ }
+
+ /**
+ * do sampling for inference
+ * m: document number
+ * n: word number?
+ */
+ protected int infSampling(int m, int n)
+ {
+ // remove z_i from the count variables
+ int topic = newModel.z[m].get(n);
+ int _w = newModel.data.docs.get(m).words[n];
+ int w = newModel.data.lid2gid.get(_w);
+
+ newModel.nw[_w][topic] -= 1;
+ newModel.nd[m][topic] -= 1;
+ newModel.nwsum[topic] -= 1;
+ newModel.ndsum[m] -= 1;
+
+ int[] nw_inf_m__w = null;
+ if (option.infSeparately) {
+ nw_inf_m__w = newModel.nw_inf.get(m).get(_w);
+ nw_inf_m__w[topic] -= 1;
+ newModel.nwsum_inf[m][topic] -= 1;
+ }
+
+ double Vbeta = trnModel.V * newModel.beta;
+
+ // get labels for this document
+ int[] labels = newModel.data.docs.get(m).labels;
+
+ // determine number of possible topics for this document
+ int K_m = (labels == null) ? newModel.K : labels.length;
+
+ // do multinomial sampling via cumulative method
+ double[] p = newModel.p;
+ for (int k = 0; k < K_m; k++) {
+ topic = labels == null ? k : labels[k];
+
+ int nw_k, nwsum_k;
+ if (option.infSeparately) {
+ nw_k = nw_inf_m__w[topic];
+ nwsum_k = newModel.nwsum_inf[m][topic];
+ } else {
+ nw_k = newModel.nw[_w][topic];
+ nwsum_k = newModel.nwsum[topic];
+ }
+
+ p[k] = (newModel.nd[m][topic] + newModel.alpha) *
+ (trnModel.nw[w][topic] + nw_k + newModel.beta) /
+ (trnModel.nwsum[topic] + nwsum_k + Vbeta);
+ }
+
+ // cumulate multinomial parameters
+ for (int k = 1; k < K_m; k++){
+ p[k] += p[k - 1];
+ }
+
+ // scaled sample because of unnormalized p[]
+ double u = Math.random() * p[K_m - 1];
+
+ for (topic = 0; topic < K_m; topic++){
+ if (p[topic] > u)
+ break;
+ }
+
+ // map [0, K_m - 1] topic to [0, K - 1] topic according to labels
+ if (labels != null) {
+ topic = labels[topic];
+ }
+
+ // add newly estimated z_i to count variables
+ newModel.nw[_w][topic] += 1;
+ newModel.nd[m][topic] += 1;
+ newModel.nwsum[topic] += 1;
+ newModel.ndsum[m] += 1;
+
+ if (option.infSeparately) {
+ nw_inf_m__w[topic] += 1;
+ newModel.nwsum_inf[m][topic] += 1;
+ }
+
+ return topic;
+ }
+}
diff --git a/java_LabledLDA/src/jgibblda/LDA.java b/java_LabledLDA/src/jgibblda/LDA.java
new file mode 100644
index 0000000..87e24e9
--- /dev/null
+++ b/java_LabledLDA/src/jgibblda/LDA.java
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2007 by
+ *
+ * Xuan-Hieu Phan
+ * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
+ * Graduate School of Information Sciences
+ * Tohoku University
+ *
+ * Cam-Tu Nguyen
+ * ncamtu@gmail.com
+ * College of Technology
+ * Vietnam National University, Hanoi
+ *
+ * JGibbsLDA is a free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * JGibbsLDA is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with JGibbsLDA; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+package jgibblda;
+
+import java.io.FileNotFoundException;
+
+import org.kohsuke.args4j.*;
+
+
+public class LDA
+{
+ public static void main(String args[])
+ {
+ LDACmdOption option = new LDACmdOption();
+ CmdLineParser parser = new CmdLineParser(option);
+
+ try {
+ if (args.length == 0){
+ showHelp(parser);
+ return;
+ }
+
+ parser.parseArgument(args);
+
+ if (option.est || option.estc){
+ Estimator estimator = new Estimator(option);
+ estimator.estimate();
+ }
+ else if (option.inf){
+ Inferencer inferencer = new Inferencer(option);
+ Model newModel = inferencer.inference();
+ }
+ } catch (CmdLineException cle){
+ System.out.println("Command line error: " + cle.getMessage());
+ showHelp(parser);
+ return;
+ } catch (FileNotFoundException e) {
+ e.printStackTrace();
+ return;
+ } catch (Exception e){
+ System.out.println("Error in main: " + e.getMessage());
+ e.printStackTrace();
+ return;
+ }
+ }
+
+ public static void showHelp(CmdLineParser parser){
+ System.out.println("LDA [options ...] [arguments...]");
+ parser.printUsage(System.out);
+ }
+
+}
diff --git a/java_LabledLDA/src/jgibblda/LDACmdOption.java b/java_LabledLDA/src/jgibblda/LDACmdOption.java
new file mode 100644
index 0000000..e4ab0b4
--- /dev/null
+++ b/java_LabledLDA/src/jgibblda/LDACmdOption.java
@@ -0,0 +1,51 @@
+package jgibblda;
+
+import org.kohsuke.args4j.*;
+
+public class LDACmdOption {
+
+ @Option(name="-est", usage="Specify whether we want to estimate model from scratch")
+ public boolean est = false;
+
+ @Option(name="-estc", usage="Specify whether we want to continue the last estimation")
+ public boolean estc = false;
+
+ @Option(name="-inf", usage="Specify whether we want to do inference")
+ public boolean inf = true;
+
+ @Option(name="-infseparately", usage="Do inference for each document separately")
+ public boolean infSeparately = false;
+
+ @Option(name="-unlabeled", usage="Ignore document labels")
+ public boolean unlabeled = false;
+
+ @Option(name="-dir", usage="Specify directory")
+ public String dir = "";
+
+ @Option(name="-dfile", usage="Specify data file (*.gz)")
+ public String dfile = "";
+
+ @Option(name="-model", usage="Specify the model name")
+ public String modelName = "";
+
+ @Option(name="-alpha", usage="Specify alpha")
+ public double alpha = -1;
+
+ @Option(name="-beta", usage="Specify beta")
+ public double beta = -1;
+
+ @Option(name="-ntopics", usage="Specify the number of topics")
+ public int K = 100;
+
+ @Option(name="-niters", usage="Specify the number of iterations")
+ public int niters = 1000;
+
+ @Option(name="-nburnin", usage="Specify the number of burn-in iterations")
+ public int nburnin = 500;
+
+ @Option(name="-samplinglag", usage="Specify the sampling lag")
+ public int samplingLag = 5;
+
+ @Option(name="-twords", usage="Specify the number of most likely words to be printed for each topic")
+ public int twords = 100;
+}
diff --git a/java_LabledLDA/src/jgibblda/LDADataset.java b/java_LabledLDA/src/jgibblda/LDADataset.java
new file mode 100644
index 0000000..2ff89af
--- /dev/null
+++ b/java_LabledLDA/src/jgibblda/LDADataset.java
@@ -0,0 +1,179 @@
+/*
+ * Copyright (C) 2007 by
+ *
+ * Xuan-Hieu Phan
+ * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
+ * Graduate School of Information Sciences
+ * Tohoku University
+ *
+ * Cam-Tu Nguyen
+ * ncamtu@gmail.com
+ * College of Technology
+ * Vietnam National University, Hanoi
+ *
+ * JGibbsLDA is a free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * JGibbsLDA is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with JGibbsLDA; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+package jgibblda;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.zip.GZIPInputStream;
+
+import gnu.trove.list.array.TIntArrayList;
+import gnu.trove.map.hash.TIntIntHashMap;
+import gnu.trove.set.hash.TIntHashSet;
+
+public class LDADataset {
+ //---------------------------------------------------------------
+ // Instance Variables
+ //---------------------------------------------------------------
+
+ public Dictionary localDict = new Dictionary(); // local dictionary
+    public ArrayList<Document> docs = new ArrayList<Document>(); // a list of documents
+ public int M = 0; // number of documents
+ public int V = 0; // number of words
+
+ // map from local coordinates (id) to global ones
+ // null if the global dictionary is not set
+ public TIntIntHashMap lid2gid = null;
+
+ //link to a global dictionary (optional), null for train data, not null for test data
+ public Dictionary globalDict = null;
+
+ //-------------------------------------------------------------
+ //Public Instance Methods
+ //-------------------------------------------------------------
+ public void setM(int M)
+ {
+ this.M = M;
+ }
+
+ public void setDictionary(Dictionary globalDict)
+ {
+ lid2gid = new TIntIntHashMap();
+ this.globalDict = globalDict;
+ }
+
+ /**
+     * set the document at index idx, replacing the existing entry if idx is within the list and appending otherwise
+ * @param doc document to be set
+ * @param idx index in the document array
+ */
+ public void setDoc(Document doc, int idx){
+ if (idx < docs.size()) {
+ docs.set(idx, doc);
+ } else {
+ docs.add(idx, doc);
+ }
+ }
+
+ /**
+ * add a new document
+ * @param str string contains doc
+ */
+ public void addDoc(String str, boolean unlabeled)
+ {
+ // read document labels (if provided)
+ TIntArrayList labels = null;
+ if (str.startsWith("[")) {
+ String[] labelsBoundary = str.
+ substring(1). // remove initial '['
+ split("]", 2); // separate labels and str between ']'
+ String[] labelStrs = labelsBoundary[0].trim().split("[ \\t]");
+ str = labelsBoundary[1].trim();
+
+ // parse labels (unless we're ignoring the labels)
+ if (!unlabeled) {
+ // store labels in a HashSet to ensure uniqueness
+ TIntHashSet label_set = new TIntHashSet();
+ for (String labelStr : labelStrs) {
+ try {
+ label_set.add(Integer.parseInt(labelStr.trim()));
+ } catch (NumberFormatException nfe) {
+ System.err.println("Unknown document label ( " + labelStr + " ) for document " + docs.size() + ".");
+ }
+ }
+ labels = new TIntArrayList(label_set);
+ labels.sort();
+ }
+ }
+
+ String[] words = str.split("[ \\t\\n]");
+ TIntArrayList ids = new TIntArrayList();
+ for (String word : words){
+ if (word.trim().equals("")) {
+ continue;
+ }
+
+ int _id = localDict.word2id.size();
+
+ if (localDict.contains(word))
+ _id = localDict.getID(word);
+
+ if (globalDict != null) {
+ //get the global id
+ if (globalDict.contains(word)) {
+ localDict.addWord(word);
+
+ lid2gid.put(_id, globalDict.getID(word));
+ ids.add(_id);
+ }
+ }
+ else {
+ localDict.addWord(word);
+ ids.add(_id);
+ }
+ }
+
+ setDoc(new Document(ids, str, labels), docs.size());
+
+ V = localDict.word2id.size();
+ }
+
+ //---------------------------------------------------------------
+ // I/O methods
+ //---------------------------------------------------------------
+
+ /**
+ * read a dataset from a file
+ * @return true if success and false otherwise
+ */
+ public boolean readDataSet(String filename, boolean unlabeled) throws FileNotFoundException, IOException
+ {
+ BufferedReader reader = new BufferedReader(new InputStreamReader(
+ new GZIPInputStream(
+ new FileInputStream(filename)), "UTF-8"));
+ try {
+ String line;
+ while ((line = reader.readLine()) != null) {
+ addDoc(line, unlabeled);
+ }
+ setM(docs.size());
+
+ // debug output
+ System.out.println("Dataset loaded:");
+ System.out.println("\tM:" + M);
+ System.out.println("\tV:" + V);
+
+ return true;
+ } finally {
+ reader.close();
+ }
+ }
+}
diff --git a/java_LabledLDA/src/jgibblda/Model.java b/java_LabledLDA/src/jgibblda/Model.java
new file mode 100644
index 0000000..22f4c09
--- /dev/null
+++ b/java_LabledLDA/src/jgibblda/Model.java
@@ -0,0 +1,669 @@
+/*
+ * Copyright (C) 2007 by
+ *
+ * Xuan-Hieu Phan
+ * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
+ * Graduate School of Information Sciences
+ * Tohoku University
+ *
+ * Cam-Tu Nguyen
+ * ncamtu@gmail.com
+ * College of Technology
+ * Vietnam National University, Hanoi
+ *
+ * JGibbsLDA is a free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * JGibbsLDA is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with JGibbsLDA; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+package jgibblda;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.StringTokenizer;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+
+import gnu.trove.list.array.TIntArrayList;
+import gnu.trove.map.hash.TIntObjectHashMap;
+
+public class Model {
+
+ //---------------------------------------------------------------
+ // Class Variables
+ //---------------------------------------------------------------
+
+ public static String tassignSuffix = ".tassign.gz"; // suffix for topic assignment file
+ public static String thetaSuffix = ".theta.gz"; // suffix for theta (topic - document distribution) file
+ public static String phiSuffix = ".phi.gz"; // suffix for phi file (topic - word distribution) file
+ public static String othersSuffix = ".others.gz"; // suffix for containing other parameters
+ public static String twordsSuffix = ".twords.gz"; // suffix for file containing words-per-topics
+ public static String wordMapSuffix = ".wordmap.gz"; // suffix for file containing word to id map
+
+ //---------------------------------------------------------------
+ // Model Parameters and Variables
+ //---------------------------------------------------------------
+
+
+ public String dir = "./";
+ public String dfile = "trndocs.dat";
+ public boolean unlabeled = false;
+ public String modelName = "model";
+ public LDADataset data; // link to a dataset
+
+ public int M = 0; // dataset size (i.e., number of docs)
+ public int V = 0; // vocabulary size
+ public int K = 100; // number of topics
+ public double alpha; // LDA hyperparameters
+ public double beta = 0.01; // LDA hyperparameters
+    public int niters = 1000; // number of Gibbs sampling iterations
+ public int nburnin = 500; // number of Gibbs sampling burn-in iterations
+ public int samplingLag = 5;// Gibbs sampling sample lag
+ public int numSamples = 1; // number of samples taken
+ public int liter = 0; // the iteration at which the model was saved
+ public int twords = 20; // print out top words per each topic
+
+ // Estimated/Inferenced parameters
+ public double[][] theta = null; // theta: document - topic distributions, size M x K
+ public double[][] phi = null; // phi: topic-word distributions, size K x V
+
+ // Temp variables while sampling
+ public TIntArrayList[] z = null; // topic assignments for words, size M x doc.size()
+ protected int[][] nw = null; // nw[i][j]: number of instances of word/term i assigned to topic j, size V x K
+ protected int[][] nd = null; // nd[i][j]: number of words in document i assigned to topic j, size M x K
+ protected int[] nwsum = null; // nwsum[j]: total number of words assigned to topic j, size K
+ protected int[] ndsum = null; // ndsum[i]: total number of words in document i, size M
+
+    protected ArrayList<TIntObjectHashMap<int[]>> nw_inf = null; // nw_inf[m][i][j]: number of instances of word/term i assigned to topic j in doc m, size M x V x K
+    protected int[][] nwsum_inf = null; // nwsum_inf[m][j]: total number of words assigned to topic j in doc m, size M x K
+
+ // temp variables for sampling
+ protected double[] p = null;
+
+ //---------------------------------------------------------------
+ // Constructors
+ //---------------------------------------------------------------
+
+ public Model(LDACmdOption option) throws FileNotFoundException, IOException
+ {
+ this(option, null);
+ }
+
+ public Model(LDACmdOption option, Model trnModel) throws FileNotFoundException, IOException
+ {
+ modelName = option.modelName;
+ K = option.K;
+
+ alpha = option.alpha;
+ if (alpha < 0.0)
+ alpha = 50.0 / K;
+
+ if (option.beta >= 0)
+ beta = option.beta;
+
+ niters = option.niters;
+ nburnin = option.nburnin;
+ samplingLag = option.samplingLag;
+
+ dir = option.dir;
+ if (dir.endsWith(File.separator))
+ dir = dir.substring(0, dir.length() - 1);
+
+ dfile = option.dfile;
+ unlabeled = option.unlabeled;
+ twords = option.twords;
+
+ // initialize dataset
+ data = new LDADataset();
+
+ // process trnModel (if given)
+ if (trnModel != null) {
+ data.setDictionary(trnModel.data.localDict);
+ K = trnModel.K;
+
+ // use hyperparameters from model (if not overridden in options)
+ if (option.alpha < 0.0)
+ alpha = trnModel.alpha;
+ if (option.beta < 0.0)
+ beta = trnModel.beta;
+ }
+
+ // read in data
+ data.readDataSet(dir + File.separator + dfile, unlabeled);
+ }
+
+ //---------------------------------------------------------------
+ // Init Methods
+ //---------------------------------------------------------------
+
+ /**
+ * Init parameters for estimation or inference
+ */
+ public boolean init(boolean random)
+ {
+ if (random) {
+ M = data.M;
+ V = data.V;
+ z = new TIntArrayList[M];
+ } else {
+ if (!loadModel()) {
+ System.out.println("Fail to load word-topic assignment file of the model!");
+ return false;
+ }
+
+ // debug output
+ System.out.println("Model loaded:");
+ System.out.println("\talpha:" + alpha);
+ System.out.println("\tbeta:" + beta);
+ System.out.println("\tK:" + K);
+ System.out.println("\tM:" + M);
+ System.out.println("\tV:" + V);
+ }
+
+ p = new double[K];
+
+ initSS();
+
+ for (int m = 0; m < data.M; m++){
+ if (random) {
+ z[m] = new TIntArrayList();
+ }
+
+ // initialize z
+ int N = data.docs.get(m).length;
+ for (int n = 0; n < N; n++){
+ int w = data.docs.get(m).words[n];
+ int topic;
+
+ // random init a topic or load existing topic from z[m]
+ if (random) {
+ topic = (int)Math.floor(Math.random() * K);
+ z[m].add(topic);
+ } else {
+ topic = z[m].get(n);
+ }
+
+ nw[w][topic]++; // number of instances of word assigned to topic j
+ nd[m][topic]++; // number of words in document i assigned to topic j
+ nwsum[topic]++; // total number of words assigned to topic j
+ }
+
+ ndsum[m] = N; // total number of words in document i
+ }
+
+ theta = new double[M][K];
+ phi = new double[K][V];
+
+ return true;
+ }
+
+ public boolean initInf()
+ {
+ nw_inf = new ArrayList<TIntObjectHashMap<int[]>>();
+
+ nwsum_inf = new int[M][K];
+ for (int m = 0; m < M; m++) {
+ for (int k = 0; k < K; k++) {
+ nwsum_inf[m][k] = 0;
+ }
+ }
+
+ for (int m = 0; m < data.M; m++){
+ nw_inf.add(m, new TIntObjectHashMap<int[]>());
+
+ // initialize inference counts from the existing topic assignments z
+ int N = data.docs.get(m).length;
+ for (int n = 0; n < N; n++){
+ int w = data.docs.get(m).words[n];
+ int topic = z[m].get(n);
+
+ if (!nw_inf.get(m).containsKey(w)) {
+ int[] nw_inf_m_w = new int[K];
+ for (int k = 0; k < K; k++) {
+ nw_inf_m_w[k] = 0;
+ }
+ nw_inf.get(m).put(w, nw_inf_m_w);
+ }
+
+ nw_inf.get(m).get(w)[topic]++; // number of instances of word assigned to topic j in doc m
+ //nw_inf[m][w][topic]++; // number of instances of word assigned to topic j in doc m
+ nwsum_inf[m][topic]++; // total number of words assigned to topic j in doc m
+ }
+ }
+
+ return true;
+ }
+
+ /**
+ * Init sufficient stats
+ */
+ protected void initSS()
+ {
+ nw = new int[V][K];
+ for (int w = 0; w < V; w++){
+ for (int k = 0; k < K; k++){
+ nw[w][k] = 0;
+ }
+ }
+
+ nd = new int[M][K];
+ for (int m = 0; m < M; m++){
+ for (int k = 0; k < K; k++){
+ nd[m][k] = 0;
+ }
+ }
+
+ nwsum = new int[K];
+ for (int k = 0; k < K; k++){
+ nwsum[k] = 0;
+ }
+
+ ndsum = new int[M];
+ for (int m = 0; m < M; m++){
+ ndsum[m] = 0;
+ }
+ }
+
+ //---------------------------------------------------------------
+ // Update Methods
+ //---------------------------------------------------------------
+
+ public void updateParams()
+ {
+ updateTheta();
+ updatePhi();
+ numSamples++;
+ }
+ public void updateParams(Model trnModel)
+ {
+ updateTheta();
+ updatePhi(trnModel);
+ numSamples++;
+ }
+
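+ /**
+ * Note (added for clarity): updateTheta() and updatePhi() maintain a running mean over the
+ * samples taken after burn-in. For the n-th sample the update is
+ * mean_n = ((n-1) * mean_{n-1} + sample_n) / n, which is what the
+ * "convert from mean to sum" / "convert from sum to mean" steps below implement.
+ * E.g. with a previous mean of 0.2 over two samples and a new sample value of 0.5,
+ * the new mean is (2 * 0.2 + 0.5) / 3 = 0.3.
+ */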
+ public void updateTheta()
+ {
+ double Kalpha = K * alpha;
+ for (int m = 0; m < M; m++) {
+ for (int k = 0; k < K; k++) {
+ if (numSamples > 1) theta[m][k] *= numSamples - 1; // convert from mean to sum
+ theta[m][k] += (nd[m][k] + alpha) / (ndsum[m] + Kalpha);
+ if (numSamples > 1) theta[m][k] /= numSamples; // convert from sum to mean
+ }
+ }
+ }
+
+ public void updatePhi()
+ {
+ double Vbeta = V * beta;
+ for (int k = 0; k < K; k++) {
+ for (int w = 0; w < V; w++) {
+ if (numSamples > 1) phi[k][w] *= numSamples - 1; // convert from mean to sum
+ phi[k][w] += (nw[w][k] + beta) / (nwsum[k] + Vbeta);
+ if (numSamples > 1) phi[k][w] /= numSamples; // convert from sum to mean
+ }
+ }
+ }
+
+ // for inference: phi combines word-topic counts from the training model with counts from the new documents
+ public void updatePhi(Model trnModel)
+ {
+ double Vbeta = trnModel.V * beta;
+ for (int k = 0; k < K; k++) {
+ for (int _w = 0; _w < V; _w++) {
+ if (data.lid2gid.containsKey(_w)) {
+ int id = data.lid2gid.get(_w);
+
+ if (numSamples > 1) phi[k][_w] *= numSamples - 1; // convert from mean to sum
+ phi[k][_w] += (trnModel.nw[id][k] + nw[_w][k] + beta) / (trnModel.nwsum[k] + nwsum[k] + Vbeta);
+ if (numSamples > 1) phi[k][_w] /= numSamples; // convert from sum to mean
+ } // else ignore words that don't appear in training
+ } //end foreach word
+ } // end foreach topic
+ }
+
+ //---------------------------------------------------------------
+ // I/O Methods
+ //---------------------------------------------------------------
+
+ /**
+ * Save model
+ */
+ public boolean saveModel()
+ {
+ return saveModel("");
+ }
+ public boolean saveModel(String modelPrefix)
+ {
+ if (!saveModelTAssign(dir + File.separator + modelPrefix + modelName + tassignSuffix)) {
+ return false;
+ }
+
+ if (!saveModelOthers(dir + File.separator + modelPrefix + modelName + othersSuffix)) {
+ return false;
+ }
+
+ if (!saveModelTheta(dir + File.separator + modelPrefix + modelName + thetaSuffix)) {
+ return false;
+ }
+
+ //if (!saveModelPhi(dir + File.separator + modelPrefix + modelName + phiSuffix)) {
+ // return false;
+ //}
+
+ if (twords > 0) {
+ if (!saveModelTwords(dir + File.separator + modelPrefix + modelName + twordsSuffix)) {
+ return false;
+ }
+ }
+
+ if (!data.localDict.writeWordMap(dir + File.separator + modelPrefix + modelName + wordMapSuffix)) {
+ return false;
+ }
+
+ return true;
+ }
+
+ /**
+ * Save word-topic assignments for this model
+ */
+ public boolean saveModelTAssign(String filename) {
+ int i, j;
+
+ try{
+ BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
+ new GZIPOutputStream(
+ new FileOutputStream(filename)), "UTF-8"));
+
+ //write docs with topic assignments for words
+ for (i = 0; i < data.M; i++) {
+ for (j = 0; j < data.docs.get(i).length; ++j) {
+ writer.write(data.docs.get(i).words[j] + ":" + z[i].get(j) + " ");
+ }
+ writer.write("\n");
+ }
+
+ writer.close();
+ }
+ catch (Exception e) {
+ System.out.println("Error while saving model tassign: " + e.getMessage());
+ e.printStackTrace();
+ return false;
+ }
+ return true;
+ }
+
+ /**
+ * Save theta (topic distribution) for this model
+ */
+ public boolean saveModelTheta(String filename) {
+ try{
+ BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
+ new GZIPOutputStream(
+ new FileOutputStream(filename)), "UTF-8"));
+
+ for (int i = 0; i < M; i++) {
+ for (int j = 0; j < K; j++) {
+ if (theta[i][j] > 0) {
+ writer.write(j + ":" + theta[i][j] + " ");
+ }
+ }
+ writer.write("\n");
+ }
+ writer.close();
+ }
+ catch (Exception e){
+ System.out.println("Error while saving topic distribution file for this model: " + e.getMessage());
+ e.printStackTrace();
+ return false;
+ }
+ return true;
+ }
+
+ /**
+ * Save word-topic distribution
+ */
+ public boolean saveModelPhi(String filename)
+ {
+ try {
+ BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
+ new GZIPOutputStream(
+ new FileOutputStream(filename)), "UTF-8"));
+
+ for (int i = 0; i < K; i++) {
+ for (int j = 0; j < V; j++) {
+ if (phi[i][j] > 0) {
+ writer.write(j + ":" + phi[i][j] + " ");
+ }
+ }
+ writer.write("\n");
+ }
+ writer.close();
+ }
+ catch (Exception e) {
+ System.out.println("Error while saving word-topic distribution:" + e.getMessage());
+ e.printStackTrace();
+ return false;
+ }
+ return true;
+ }
+
+ /**
+ * Save other information of this model
+ */
+ public boolean saveModelOthers(String filename){
+ try{
+ BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
+ new GZIPOutputStream(
+ new FileOutputStream(filename)), "UTF-8"));
+
+ writer.write("alpha=" + alpha + "\n");
+ writer.write("beta=" + beta + "\n");
+ writer.write("ntopics=" + K + "\n");
+ writer.write("ndocs=" + M + "\n");
+ writer.write("nwords=" + V + "\n");
+ writer.write("liters=" + liter + "\n");
+
+ writer.close();
+ }
+ catch(Exception e){
+ System.out.println("Error while saving model others:" + e.getMessage());
+ e.printStackTrace();
+ return false;
+ }
+ return true;
+ }
+
+ /**
+ * Save model the most likely words for each topic
+ */
+ public boolean saveModelTwords(String filename){
+ try{
+ BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
+ new GZIPOutputStream(
+ new FileOutputStream(filename)), "UTF-8"));
+
+ if (twords > V){
+ twords = V;
+ }
+
+ for (int k = 0; k < K; k++){
+ ArrayList<Pair> wordsProbsList = new ArrayList<Pair>();
+ for (int w = 0; w < V; w++){
+ Pair p = new Pair(w, phi[k][w], false);
+
+ wordsProbsList.add(p);
+ }//end foreach word
+
+ //print topic
+ writer.write("Topic " + k + ":\n");
+ Collections.sort(wordsProbsList);
+
+ for (int i = 0; i < twords; i++){
+ if (data.localDict.contains((Integer)wordsProbsList.get(i).first)){
+ String word = data.localDict.getWord((Integer)wordsProbsList.get(i).first);
+
+ writer.write("\t" + word + "\t" + wordsProbsList.get(i).second + "\n");
+ }
+ }
+ } //end foreach topic
+
+ writer.close();
+ }
+ catch(Exception e){
+ System.out.println("Error while saving model twords: " + e.getMessage());
+ e.printStackTrace();
+ return false;
+ }
+ return true;
+ }
+
+ /**
+ * Load saved model
+ */
+ public boolean loadModel(){
+ if (!readOthersFile(dir + File.separator + modelName + othersSuffix))
+ return false;
+
+ if (!readTAssignFile(dir + File.separator + modelName + tassignSuffix))
+ return false;
+
+ // read dictionary
+ Dictionary dict = new Dictionary();
+ if (!dict.readWordMap(dir + File.separator + modelName + wordMapSuffix))
+ return false;
+
+ data.localDict = dict;
+
+ return true;
+ }
+
+ /**
+ * Load "others" file to get parameters
+ */
+ protected boolean readOthersFile(String otherFile){
+ try {
+ BufferedReader reader = new BufferedReader(new InputStreamReader(
+ new GZIPInputStream(
+ new FileInputStream(otherFile)), "UTF-8"));
+ String line;
+ while((line = reader.readLine()) != null){
+ StringTokenizer tknr = new StringTokenizer(line,"= \t\r\n");
+
+ int count = tknr.countTokens();
+ if (count != 2)
+ continue;
+
+ String optstr = tknr.nextToken();
+ String optval = tknr.nextToken();
+
+ if (optstr.equalsIgnoreCase("alpha")){
+ alpha = Double.parseDouble(optval);
+ }
+ else if (optstr.equalsIgnoreCase("beta")){
+ beta = Double.parseDouble(optval);
+ }
+ else if (optstr.equalsIgnoreCase("ntopics")){
+ K = Integer.parseInt(optval);
+ }
+ else if (optstr.equalsIgnoreCase("liter")){
+ liter = Integer.parseInt(optval);
+ }
+ else if (optstr.equalsIgnoreCase("nwords")){
+ V = Integer.parseInt(optval);
+ }
+ else if (optstr.equalsIgnoreCase("ndocs")){
+ M = Integer.parseInt(optval);
+ }
+ else {
+ // any more?
+ }
+ }
+
+ reader.close();
+ }
+ catch (Exception e){
+ System.out.println("Error while reading other file:" + e.getMessage());
+ e.printStackTrace();
+ return false;
+ }
+ return true;
+ }
+
+ /**
+ * Load word-topic assignments for this model
+ */
+ protected boolean readTAssignFile(String tassignFile)
+ {
+ try {
+ int i,j;
+ BufferedReader reader = new BufferedReader(new InputStreamReader(
+ new GZIPInputStream(
+ new FileInputStream(tassignFile)), "UTF-8"));
+
+ String line;
+ z = new TIntArrayList[M];
+ data = new LDADataset();
+ data.setM(M);
+ data.V = V;
+ for (i = 0; i < M; i++){
+ line = reader.readLine();
+ StringTokenizer tknr = new StringTokenizer(line, " \t\r\n");
+
+ int length = tknr.countTokens();
+
+ TIntArrayList words = new TIntArrayList();
+ TIntArrayList topics = new TIntArrayList();
+ for (j = 0; j < length; j++){
+ String token = tknr.nextToken();
+
+ StringTokenizer tknr2 = new StringTokenizer(token, ":");
+ if (tknr2.countTokens() != 2){
+ System.out.println("Invalid word-topic assignment line\n");
+ return false;
+ }
+
+ words.add(Integer.parseInt(tknr2.nextToken()));
+ topics.add(Integer.parseInt(tknr2.nextToken()));
+ }//end for each topic assignment
+
+ //allocate and add new document to the corpus
+ Document doc = new Document(words);
+ data.setDoc(doc, i);
+
+ //assign values for z
+ z[i] = new TIntArrayList();
+ for (j = 0; j < topics.size(); j++){
+ z[i].add(topics.get(j));
+ }
+
+ }//end for each doc
+
+ reader.close();
+ }
+ catch (Exception e){
+ System.out.println("Error while loading model: " + e.getMessage());
+ e.printStackTrace();
+ return false;
+ }
+ return true;
+ }
+}
diff --git a/java_LabledLDA/src/jgibblda/Pair.java b/java_LabledLDA/src/jgibblda/Pair.java
new file mode 100644
index 0000000..0d4d4cb
--- /dev/null
+++ b/java_LabledLDA/src/jgibblda/Pair.java
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2007 by
+ *
+ * Xuan-Hieu Phan
+ * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
+ * Graduate School of Information Sciences
+ * Tohoku University
+ *
+ * Cam-Tu Nguyen
+ * ncamtu@gmail.com
+ * College of Technology
+ * Vietnam National University, Hanoi
+ *
+ * JGibbsLDA is a free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * JGibbsLDA is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with JGibbsLDA; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+package jgibblda;
+
+import java.util.Comparator;
+
+public class Pair implements Comparable<Pair> {
+ public Object first;
+ public Comparable second;
+ public static boolean naturalOrder = false;
+
+ public Pair(Object k, Comparable v){
+ first = k;
+ second = v;
+ }
+
+ public Pair(Object k, Comparable v, boolean naturalOrder){
+ first = k;
+ second = v;
+ Pair.naturalOrder = naturalOrder;
+ }
+
+ public int compareTo(Pair p){
+ if (naturalOrder)
+ return this.second.compareTo(p.second);
+ else return -this.second.compareTo(p.second);
+ }
+}
diff --git a/old/preprocessing.py b/old/preprocessing.py
new file mode 100644
index 0000000..6bd8c3e
--- /dev/null
+++ b/old/preprocessing.py
@@ -0,0 +1,466 @@
+# -*- coding: utf-8 -*-
+import csv
+import random
+import sys
+
+import spacy
+import textacy
+
+"""
+import keras
+import numpy as np
+from keras.layers import Dense, SimpleRNN, LSTM, TimeDistributed, Dropout
+from keras.models import Sequential
+import keras.backend as K
+"""
+csv.field_size_limit(sys.maxsize)
+
+"""
+def getFirstSynonym(word, thesaurus_gen):
+
+ word = word.lower()
+ # TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python
+
+
+ # iterate over the thesaurus
+ for syn_block in thesaurus_gen: # syn_block is a list of synonyms
+
+ # iterate over the synonym block
+ for syn in syn_block:
+ syn = syn.lower().split(" ") if not re.match(r'\A[\w-]+\Z', syn) else syn # turn the synonym into a word list (to detect multi-word phrases)
+
+ # if the word is contained in the synonym (i.e. equals one of the words in the list)
+ if word in syn:
+
+ # look for the main form ("Hauptform")
+ if "auptform" in syn:
+ # do not return it if it is in parentheses
+ for w in syn:
+ if not re.match(r'\([^)]+\)', w) and w is not None:
+ return w
+
+ # if no main form is present, return the first synonym that is not a phrase and not in parentheses
+ if len(syn) == 1:
+ w = syn[0]
+ if not re.match(r'\([^)]+\)', w) and w is not None:
+ return w
+
+ return word # as a fallback, return the input word
+
+
+"""
+"""
+def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False, normalize_synonyms=False):
+
+ # use preprocessing
+ if customPreprocessing is not None:
+ string = customPreprocessing(string)
+
+
+
+ if custom_stopwords is not None:
+ custom_stopwords = custom_stopwords
+ else:
+ custom_stopwords = []
+
+ if custom_words is not None:
+ custom_words = custom_words
+ else:
+ custom_words = []
+
+ if custom_symbols is not None:
+ custom_symbols = custom_symbols
+ else:
+ custom_symbols = []
+
+
+ # custom stoplist
+ # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
+ stop_words = __import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS
+
+ stoplist =list(stop_words) + custom_stopwords
+ # List of symbols we don't care about either
+ symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols
+
+
+
+ # get rid of newlines
+ string = string.strip().replace("\n", " ").replace("\r", " ")
+
+ # replace twitter
+ mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
+ string = mentionFinder.sub("MENTION", string)
+
+ # replace emails
+ emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
+ string = emailFinder.sub("EMAIL", string)
+
+ # replace urls
+ urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
+ string = urlFinder.sub("URL", string)
+
+ # replace HTML symbols
+ string = string.replace("&", "and").replace(">", ">").replace("<", "<")
+
+
+
+
+ # parse with spaCy
+ spacy_doc = PARSER(string)
+ tokens = []
+
+ added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"]
+ added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #IDEE NUM mit in den Corpus aufnehmen, aber fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013
+
+ # append Tokens to a list
+ for tok in spacy_doc:
+ if tok.pos_ in added_POS:
+ if lemmatize:
+ tokens.append(tok.lemma_.lower().strip())
+ else:
+ tokens.append(tok.text.lower().strip())
+
+ # add entities
+ if tok.ent_type_ in added_entities:
+ tokens.append(tok.text.lower())
+
+
+
+ # remove stopwords
+ tokens = [tok for tok in tokens if tok not in stoplist]
+
+ # remove symbols
+ tokens = [tok for tok in tokens if tok not in symbols]
+
+ # remove custom_words
+ tokens = [tok for tok in tokens if tok not in custom_words]
+
+ # remove single characters
+ tokens = [tok for tok in tokens if len(tok)>1]
+
+ # remove large strings of whitespace
+ remove_large_strings_of_whitespace(" ".join(tokens))
+
+
+ #idea: resolve abbreviations (especially TU -> Technische Universität)
+
+ if normalize_synonyms:
+ tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens]
+
+ return " ".join(tokens)
+
+
+def remove_large_strings_of_whitespace(sentence):
+
+ whitespaceFinder = re.compile(r'(\r\n|\r|\n)', re.IGNORECASE)
+ sentence = whitespaceFinder.sub(" ", sentence)
+
+ tokenlist = sentence.split(" ")
+
+ while "" in tokenlist:
+ tokenlist.remove("")
+ while " " in tokenlist:
+ tokenlist.remove(" ")
+
+ return " ".join(tokenlist)
+"""
+"""
+def generateFromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False,lemmatize=False):
+ import xml.etree.ElementTree as ET
+
+ tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
+ root = tree.getroot()
+
+ for ticket in root:
+ metadata = {}
+ text = "ERROR"
+ for field in ticket:
+ if field.tag == textfield:
+ if clean:
+ text = cleanText_words(field.text,PARSER,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize)
+ else:
+ text = field.text
+ else:
+ #idea: clean here as well?
+ metadata[field.tag] = field.text
+ yield text, metadata
+"""
+
+
+LANGUAGE = 'de'
+#PARSER = de_core_news_md.load()
+PARSER = spacy.load(LANGUAGE)
+
+from old.textCleaning import TextCleaner
+
+cleaner = TextCleaner(parser=PARSER)
+
+
+def generateTextfromTicketXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False, lemmatize=False):
+ import xml.etree.ElementTree as ET
+
+ tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
+ root = tree.getroot()
+
+
+ for ticket in root:
+ text = "ERROR"
+ for field in ticket:
+ if field.tag == textfield:
+ if clean:
+ text = cleaner.normalizeSynonyms(cleaner.removeWords(cleaner.keepPOSandENT(field.text))) #,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize)
+ else:
+ text = field.text
+ yield text
+
+def generateMetadatafromTicketXML(path2xml, textfield='Beschreibung'):#,keys_to_clean=["Loesung","Zusammenfassung"]):
+ import xml.etree.ElementTree as ET
+
+ tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
+
+ root = tree.getroot()
+
+ for ticket in root:
+ metadata = {}
+ for field in ticket:
+ if field.tag != textfield:
+ if field.tag == "Zusammenfassung":
+ metadata[field.tag] = cleaner.removePunctuation(field.text)
+ elif field.tag == "Loesung":
+ metadata[field.tag] = cleaner.removeWhitespace(field.text)
+ else:
+ metadata[field.tag] = field.text
+
+ yield metadata
+
+
+
+
+"""
+def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None):
+
+ if custom_symbols is not None:
+ custom_symbols = custom_symbols
+ else:
+ custom_symbols = []
+
+ if keep is not None:
+ keep = keep
+ else:
+ keep = []
+
+ # List of symbols we don't care about
+ symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols
+
+ # parse with spaCy
+ spacy_doc = parser(string)
+ tokens = []
+
+ pos = ["NUM", "SPACE", "PUNCT"]
+ for p in keep:
+ pos.remove(p)
+
+
+ # append Tokens to a list
+ for tok in spacy_doc:
+ if tok.pos_ not in pos and tok.text not in symbols:
+ tokens.append(tok.text)
+
+ return " ".join(tokens)
+
+def cleanText_words(string,parser=PARSER, custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False):
+
+ # use preprocessing
+ if customPreprocessing is not None:
+ string = customPreprocessing(string)
+
+ if custom_stopwords is not None:
+ custom_stopwords = custom_stopwords
+ else:
+ custom_stopwords = []
+
+ if custom_words is not None:
+ custom_words = custom_words
+ else:
+ custom_words = []
+
+
+ # custom stoplist
+ # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
+ stop_words = __import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS
+
+ stoplist =list(stop_words) + custom_stopwords
+
+ # replace twitter
+ mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
+ string = mentionFinder.sub("MENTION", string)
+
+ # replace emails
+ emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
+ string = emailFinder.sub("EMAIL", string)
+
+ # replace urls
+ urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
+ string = urlFinder.sub("URL", string)
+
+ # replace HTML symbols
+ string = string.replace("&", "and").replace(">", ">").replace("<", "<")
+
+
+
+ # parse with spaCy
+ spacy_doc = parser(string)
+ tokens = []
+
+ added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"]
+ added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013
+
+ # append Tokens to a list
+ for tok in spacy_doc:
+ if tok.pos_ in added_POS:
+ if lemmatize:
+ tokens.append(tok.lemma_.lower().strip())
+ else:
+ tokens.append(tok.text.lower().strip())
+
+ # add entities
+ if tok.ent_type_ in added_entities:
+ tokens.append(tok.text.lower())
+
+
+
+ # remove stopwords
+ tokens = [tok for tok in tokens if tok not in stoplist]
+
+ # remove custom_words
+ tokens = [tok for tok in tokens if tok not in custom_words]
+
+ # remove single characters
+ tokens = [tok for tok in tokens if len(tok)>1]
+
+ # remove large strings of whitespace
+ #remove_whitespace(" ".join(tokens))
+
+
+ #idea: resolve abbreviations (especially TU -> Technische Universität): abbreviation dictionary
+
+ if normalize_synonyms:
+ tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens]
+
+ return " ".join(set(tokens))
+
+def cleanText_removeWhitespace(sentence):
+ whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
+ sentence = whitespaceFinder.sub(" ", sentence)
+ return sentence
+
+#todo: preprocess pipe: removewhitespace, removePUNCT, resolveAbk, keepPOS, keepEnt, removeWords, normalizeSynonyms
+
+
+def getFirstSynonym(word, thesaurus_gen):
+
+ word = word.lower()
+
+
+ # iterate over the thesaurus
+ for syn_block in thesaurus_gen: # syn_block is a list of synonyms
+
+ for syn in syn_block:
+ syn = syn.lower()
+ if re.match(r'\A[\w-]+\Z', syn): # if syn is a single word
+ if word == syn:
+ return getHauptform(syn_block, word)
+ else: # if it is a phrase
+ if word in syn:
+ return getHauptform(syn_block, word)
+ return word # as a fallback, return the original word
+
+def getHauptform(syn_block, word, default_return_first_Syn=False):
+
+ for syn in syn_block:
+ syn = syn.lower()
+
+ if "hauptform" in syn and len(syn.split(" ")) <= 2:
+ # nicht ausgeben, falls es in Klammern steht
+ for w in syn.split(" "):
+ if not re.match(r'\([^)]+\)', w):
+ return w
+
+ if default_return_first_Syn:
+ # falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht
+ for w in syn_block:
+ if not re.match(r'\([^)]+\)', w):
+ return w
+ return word # zur Not, das ursrpüngliche Wort zurückgeben
+"""
+
+def printRandomDoc(textacyCorpus):
+ print()
+
+ print("len(textacyCorpus) = %i" % len(textacyCorpus))
+ randIndex = int((len(textacyCorpus) - 1) * random.random())
+ print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
+
+ print()
+
+####################'####################'####################'####################'####################'##############
+# todo config-file
+
+DATAPATH = "ticketSamples.xml"
+DATAPATH_thesaurus = "openthesaurus.csv"
+
+
+
+normalize_Synonyms = True
+clean = True
+lemmatize = True
+
+custom_words = ["grüßen", "fragen"]
+
+####################'####################'####################'####################'####################'##############
+
+
+## files to textacy-corpus
+textacyCorpus = textacy.Corpus(PARSER)
+
+print("add texts to textacy-corpus...")
+textacyCorpus.add_texts(texts=generateTextfromTicketXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromTicketXML(DATAPATH))
+
+
+#for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize):
+# textacyCorpus.add_text(txt,dic)
+
+
+
+for doc in textacyCorpus:
+ print(doc.metadata)
+ print(doc.text)
+
+#print(textacyCorpus[2].text)
+#printRandomDoc(textacyCorpus)
+#print(textacyCorpus[len(textacyCorpus)-1].text)
+
+
+print()
+print()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/old/test.py b/old/test.py
new file mode 100644
index 0000000..fc2ee00
--- /dev/null
+++ b/old/test.py
@@ -0,0 +1,213 @@
+# -*- coding: utf-8 -*-
+import spacy
+import textacy
+from spacy.tokens import Doc
+
+# -*- coding: utf-8 -*-
+import re
+import spacy
+import functools
+
+import textacy
+
+
+class TextCleaner:
+
+ def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4All=None):
+ """
+ :param parser: spacy-parser
+ :param thesaurus: [[syn1, syn2, ...],[syn1, syn2, ...], ...]
+ :param customClass_symbols:[str]
+ :param customClass_words:[str]
+ :param customClassPOS:[str]
+ :param keep4All: [str]
+ """
+ if thesaurus is None:
+ DATAPATH_thesaurus = "openthesaurus.csv"
+
+ ## !!!!!! list is important here, otherwise the same synonyms are not returned each time, because a generator is consumed during runtime
+ self.thesaurus = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))
+ else:
+ self.thesaurus = thesaurus
+
+ self.parser = parser
+
+ #self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
+ self.mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
+ self.emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
+ self.urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
+
+ # to keep
+ self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"] # ,"PERSON"]
+ self.pos2keep = ["NOUN"] # , "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013
+
+ """
+
+ # to remove
+ self.symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|", "||",
+ ";", ":",
+ "…", "’s", "'s", ".", "(", ")", "[", "]", "#"] + (customClass_symbols if customClass_symbols is not None else [])
+ self.stop_words = list(__import__("spacy." + self.parser.lang, globals(), locals(), ['object']).STOP_WORDS)+ (customClass_words if customClass_words is not None else [])
+
+
+ self.entities2keep = self.entities2keep + (keep4All if keep4All is not None else [])
+ self.pos2keep = self.pos2keep + (keep4All if keep4All is not None else [])
+
+
+ keep = (keep4All if hasattr(keep4All, '__iter__') else []) + self.pos2keep + self.entities2keep
+
+
+ # modify those to remove with those to keep
+ for sym in keep:
+ try:
+ self.symbols.remove(sym)
+ except ValueError:
+ pass
+ for sym in keep:
+ try:
+ self.stop_words.remove(sym)
+ except ValueError:
+ pass
+ """
+
+ def loadString(self,string):
+ self.currentDoc = self.parser(string)
+
+
+ def removeWhitespace(self, string):
+ return " ".join([tok.text for tok in self.currentDoc if not tok.is_space])
+
+
+ def removePunctuation(self, string, custom_symbols=None, keep=None):
+ symbols = self.symbols + (custom_symbols if custom_symbols is not None else [])
+ if hasattr(keep, '__iter__'):
+ for k in keep:
+ try:
+ symbols.remove(k)
+ except ValueError:
+ pass
+
+ return " ".join([tok.text for tok in self.currentDoc if not tok.is_punct and tok.text not in symbols])
+
+
+def cleanDoc(doc, toDelete=None, toKeep=None):
+ """
+ :param doc: spacyDoc
+ :param toDelete: [str] pos_ , ent_type_ or tag_
+ :return: str tokenlist
+ """
+ #keep
+ tokenlist = []
+ for tok in doc:
+ if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep:
+ tokenlist.append(tok.text)
+
+ #delete
+ tokenlist = [tok.text for tok in doc if tok.pos_ not in toDelete and tok.ent_type_ not in toDelete and tok.tag_ not in toDelete]
+
+ result = " ".join(tokenlist)
+ return result #problem: not a Doc, therefore not composable
+
+
+def keepinDoc(doc, toKeep=None):
+ """
+ :param doc: spacyDoc
+ :param toDelete: [str]
+ :return: str tokenlist
+ """
+ return " ".join([tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep])
+
+
+# https://mathieularose.com/function-composition-in-python/
+parser = spacy.load('de')
+cleaner = TextCleaner(parser)
+corpus_raw = textacy.Corpus(parser)
+corpus_clean = textacy.Corpus(parser)
+
+def foo(doc, toKeep=None):
+
+ words = [tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep]
+ spaces = [True] * len(words)
+
+ return Doc(doc.vocab,words=words,spaces=spaces)
+
+def foo2(doc, toDelete=None):#, toKeep=None):
+ """
+ :param doc: spacyDoc
+ :param toDelete: [str] pos_ , ent_type_ or tag_
+ :return: str tokenlist
+ """
+ #keep
+ #tokenlist = [tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep]
+
+ #delete
+
+ words = [tok.text for tok in doc if tok.pos_ not in toDelete and tok.ent_type_ not in toDelete and tok.tag_ not in toDelete]
+ spaces = [True] * len(words)
+
+ return Doc(doc.vocab, words=words, spaces=spaces)
+
+
+"""
+def compose(self,*functions):
+ return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)
+
+def composeo(*functions):
+ return functools.reduce(lambda f, g: lambda x: f(g(x)), functions)
+"""
+
+def double(a):
+ return a*2
+
+def add(a, b):
+ return a+b
+
+def compose(*functions):
+ def compose2(f, g):
+ return lambda x: f(g(x))
+ return functools.reduce(compose2, functions, lambda x: x)
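+# note added for clarity: the right-most function is applied first, e.g.
+#   compose(double, functools.partial(add, 3))(2) == double(add(3, 2)) == 10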
+
+
+
+
+
+#pipeline = compose(removeFromDoc, cleaner.removeWhitespace, cleaner.loadString)
+"""
+def pipe1(string):
+ cleaner.loadString(string)
+ string = cleaner.removeWhitespace(string)
+ string = cleaner.removePunctuation(string)
+ return string
+"""
+
+def cleaningPipe(spacy_pipe, composition):
+ for doc in spacy_pipe:
+ yield composition(doc)
+
+
+pipeline = compose(
+ functools.partial(foo2, toDelete=["PUNCT", "SPACE"]),
+ functools.partial(foo, toKeep=["NOUN"]))
+
+
+string = "Frau Hinrichs überdenkt die tu Situation und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"
+
+doc = parser(string)
+
+#print(removeFromDoc(doc,toDelete=["PUNCT"]))
+
+print(pipeline(doc.text))
+
+
+
+for txt in cleaningPipe(parser.pipe([string]),pipeline):
+ print(txt)
+"""
+corpus_raw.add_text(string)
+for doc in parser.pipe([string]):
+ doc.text = removeFromDoc(doc, toDelete=["PUNCT"])
+"""
+
+#corpus_clean.add_texts(cleaningPipe(parser.pipe([string]),pipeline))
+#print(corpus_raw[0].text)
+
diff --git a/old/testo.py b/old/testo.py
new file mode 100644
index 0000000..5a990f6
--- /dev/null
+++ b/old/testo.py
@@ -0,0 +1,199 @@
+# -*- coding: utf-8 -*-
+import functools
+import re
+
+import spacy
+import textacy
+from spacy.tokens import Doc
+from spacy.tagger import Tagger
+
+import xml.etree.ElementTree as ET
+
+PARSER = spacy.load('de')
+stop_words = list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)
+
+def compose(*functions):
+ def compose2(f, g):
+ return lambda x: f(g(x))
+ return functools.reduce(compose2, functions, lambda x: x)
+
+
+def cleanTexts(textstream, parser, attr):
+
+ #input str-stream output str-stream
+ pipe = parser.pipe(textstream)
+
+ for doc in pipe:
+
+ tokens = [tok.text for tok in doc
+ if tok.pos_ not in attr
+ and tok.tag_ not in attr
+ and tok.ent_type_ not in attr
+ and tok.text not in attr
+ and tok.lower_ not in attr]
+
+
+ yield " ".join(tokens)
+
+
+ """
+def cleanDoc_lemmatize(doc,parser=PARSER):
+ return parser(" ".join([tok.lemma_ for tok in doc ]))
+
+
+def cleanDoc_STOPS(doc,parser=PARSER, stop_words=None, keep=None):
+ if stop_words is None:
+ stop_words = list(__import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS)
+
+ if hasattr(keep, '__iter__'):
+ for k in keep:
+ try:
+ stop_words.remove(k)
+ except ValueError:
+ pass
+
+ return parser(" ".join([tok.text for tok in doc if tok.text not in stop_words]))
+
+
+
+def cleanDoc_ENT(doc,parser=PARSER, keeponly=False, attr=["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]):
+ if keeponly:
+ return parser(" ".join([tok.text for tok in doc if tok.ent_ in attr]))
+ else:
+ return parser(" ".join([tok.text for tok in doc if tok.ent_ not in attr]))
+
+
+
+def cleanDoc_POS(doc,parser=PARSER, keeponly=False, attr=["SPACE", "PUNCT"]):
+ if keeponly:
+ return parser(" ".join([tok.text for tok in doc if tok.pos_ in attr]))
+ else:
+ return parser(" ".join([tok.text for tok in doc if tok.pos_ not in attr]))
+"""
+
+
+def cleanTexts_POS(spacypipe, keeponly=False, attr=["SPACE", "PUNCT"]):
+ """
+ :param spacypipe: spacypipe
+ :param keeponly: bool . If True, only attr will be kept. If false, all attr will be deleted
+ :param attr: [str] pos_ or ent_type_
+ :yields: stream of strings: full-length cleaned text
+ """
+ if keeponly:
+ for doc in spacypipe:
+ yield " ".join([tok.text for tok in doc if tok.pos_ in attr])
+
+ else:
+ for doc in spacypipe:
+ yield " ".join([tok.text for tok in doc if tok.pos_ not in attr])
+
+def cleanText_POS(text,parser=PARSER, keeponly=False, attr=["SPACE", "PUNCT"]):
+ """
+ :param txt: str
+ :param keeponly: bool . If True, only attr will be kept. If false, all attr will be deleted
+ :param attr: [str] pos_ or ent_type_
+ :return: str
+ """
+ doc = parser(text)
+
+ if keeponly:
+ return " ".join([tok.text for tok in doc if tok.pos_ in attr])
+ else:
+ return " ".join([tok.text for tok in doc if tok.pos_ not in attr])
+
+
+def removeWhitespace(string):
+ return re.sub(r'(\r\n|\r|\n|(\s)+)', ' ', string)
+
+def removeWords(string, words):
+ big_regex = re.compile('|'.join(map(re.escape, words)))
+ return big_regex.sub("", string)
+
+
+
+
+
+
+def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung', cleaning_function=None):
+ """
+ generates strings from XML
+ :param path2xml:
+ :param main_textfield:
+ :param cleaning_function:
+ :yields strings
+ """
+ import xml.etree.ElementTree as ET
+
+ tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
+ root = tree.getroot()
+
+
+ for ticket in root:
+ text = "ERROR"
+ for field in ticket:
+ if field.tag == main_textfield:
+ if cleaning_function:
+ text = cleaning_function(field.text)
+ else:
+ text = field.text
+ yield text
+
+def generateMetadatafromTicketXML(path2xml, key_function_pairs_to_clean, leave_out=['Beschreibung']):
+ import xml.etree.ElementTree as ET
+
+ tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
+
+ root = tree.getroot()
+
+ for ticket in root:
+ metadata = {}
+ for field in ticket:
+ if field.tag not in leave_out:
+
+ if field.tag in key_function_pairs_to_clean:
+ metadata[field.tag] = key_function_pairs_to_clean[field.tag](field.text)
+ else:
+ metadata[field.tag] = field.text
+
+ yield metadata
+
+
+
+
+string = "Frau Hinrichs überdenkt die tu Situation a@bc.de und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"
+
+#print(removeWords(string,["die", "neue"]))
+
+# in:str out:str
+cleanString = compose(
+ cleanText_POS,
+ functools.partial(textacy.preprocess.replace_emails, replace_with=u'EMAIL')
+)
+
+key_function_pairs_to_clean = {
+ "Loesung":removeWhitespace,
+ "Zusammenfassung":cleanText_POS
+}
+"""
+# in:str-gen out:str-gen
+cleanStream = compose(
+ removeSTOP,
+ lemmatize,
+ cleanEnt
+)
+"""
+# content: xml -> stringCleaning -> pipe -> docCleaning -> corpus
+# metadata:xml -> -> stringCleaning -> corpus
+
+corpus = textacy.Corpus(PARSER)
+
+
+
+
+corpus.add_texts(
+ cleanTexts(generateMainTextfromTicketXML("ticketSamples.xml"),PARSER,["PUNCT","SPACE","PERSON"])#,
+ #generateMetadatafromTicketXML("ticketSamples.xml",key_function_pairs_to_clean=key_function_pairs_to_clean)
+)
+
+print(corpus[0].text)
+
diff --git a/old/textCleaning.py b/old/textCleaning.py
new file mode 100644
index 0000000..da2fcd3
--- /dev/null
+++ b/old/textCleaning.py
@@ -0,0 +1,263 @@
+# -*- coding: utf-8 -*-
+import re
+import spacy
+import functools
+
+import textacy
+
+
+class TextCleaner:
+
+ def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4All=None):
+ """
+ :param parser: spacy-parser
+ :param thesaurus: [[syn1, syn2, ...],[syn1, syn2, ...], ...]
+ :param customClass_symbols:[str]
+ :param customClass_words:[str]
+ :param customClassPOS:[str]
+ :param keep4All: [str]
+ """
+ if thesaurus is None:
+ DATAPATH_thesaurus = "openthesaurus.csv"
+
+ ## !!!!!! list is important here, otherwise the same synonyms are not returned each time, because a generator is consumed during runtime
+ self.thesaurus = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))
+ else:
+ self.thesaurus = thesaurus
+
+ self.parser = parser
+
+
+
+ self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
+ self.mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
+ self.emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
+ self.urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
+
+
+
+ # to remove
+ self.symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|", "||",
+ ";", ":",
+ "…", "’s", "'s", ".", "(", ")", "[", "]", "#"] + (customClass_symbols if customClass_symbols is not None else [])
+ self.stop_words = list(__import__("spacy." + self.parser.lang, globals(), locals(), ['object']).STOP_WORDS)+ (customClass_words if customClass_words is not None else [])
+
+
+
+ # to keep
+ self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"] # ,"PERSON"]
+ self.pos2keep = ["NOUN"] # , "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013
+
+ self.entities2keep = self.entities2keep + (keep4All if keep4All is not None else [])
+ self.pos2keep = self.pos2keep + (keep4All if keep4All is not None else [])
+
+
+ keep = (keep4All if hasattr(keep4All, '__iter__') else []) + self.pos2keep + self.entities2keep
+
+
+ # modify those to remove with those to keep
+ for sym in keep:
+ try:
+ self.symbols.remove(sym)
+ except ValueError:
+ pass
+ for sym in keep:
+ try:
+ self.stop_words.remove(sym)
+ except ValueError:
+ pass
+
+
+ # idea: build self.currentDoc = spacy.Doc once per string, not once per method
+ def loadString(self,string):
+ self.currentDoc = self.parser(string)
+
+ """
+ def removeWhitespace(self, string):
+ string = self.whitespaceFinder.sub(" ", string)
+ return string
+ """
+ def removeWhitespace(self, string):
+ return string
+
+ #self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
+
+ def removePunctuation(self, string, custom_symbols=None, keep=None):
+
+
+ symbols = self.symbols + (custom_symbols if custom_symbols is not None else [])
+
+ if hasattr(keep, '__iter__'):
+ for k in keep:
+ try:
+ symbols.remove(k)
+ except ValueError:
+ pass
+
+
+ # parse with spaCy
+ doc = self.parser(string)
+ tokens = []
+
+ # append Tokens to a list
+ for tok in doc:
+ if not tok.is_punct and not tok.is_space and tok.text not in symbols:
+ tokens.append(tok.text)
+
+ return " ".join(tokens)
+
+ def keepPOSandENT(self, string, customPOS=None, customEnt=None, remove=None):
+
+ pos2keep = self.pos2keep + (customPOS if customPOS is not None else [])
+ ent = self.entities2keep + (customEnt if customEnt is not None else [])
+
+ if hasattr(remove, '__iter__'):
+ for k in remove:
+ try:
+ ent.remove(k)
+ except ValueError:
+ try:
+ pos2keep.remove(k)
+ except ValueError:
+ pass
+
+ # parse with spaCy
+ spacy_doc = self.parser(string)
+ tokens = []
+
+ # append Tokens to a list
+ for tok in spacy_doc:
+
+ if tok.pos_ in pos2keep:
+ tokens.append(tok.text)
+
+ if tok.ent_type_ in ent:
+ tokens.append(tok.text)
+
+ return " ".join(set(tokens))
+
+
+
+
+
+ def resolveAbbreviations(self,string):
+ return string #todo
+ def removeWords(self,string, custom_words=None, keep=None, lemmatize=False):
+
+ wordlist = self.stop_words + (custom_words if custom_words is not None else [])
+ if hasattr(keep, '__iter__'):
+ for k in keep:
+ try:
+ wordlist.remove(k)
+ except ValueError:
+ pass
+
+
+
+ string = self.urlFinder.sub("URL", string)
+ string = self.emailFinder.sub("EMAIL", string)
+ string = self.mentionFinder.sub("MENTION", string)
+ string = string.replace("&", "and").replace(">", ">").replace("<", "<")
+
+
+ # parse with spaCy
+ spacy_doc = self.parser(string)
+ tokens = []
+
+ # append Tokens to a list
+ for tok in spacy_doc:
+
+ #do not include stopwords/customwords and single chars
+ if tok.text not in wordlist and len(tok)>1:
+ if lemmatize:
+ tokens.append(tok.lemma_)
+ else:
+ tokens.append(tok.lower_)
+ return " ".join(set(tokens))
+
+
+ def normalizeSynonyms(self, string, default_return_first_Syn=False):
+ # parse with spaCy
+ spacy_doc = self.parser(string)
+ tokens = []
+
+ tokens = [str(self.getFirstSynonym(tok, self.thesaurus, default_return_first_Syn=default_return_first_Syn)) for tok in spacy_doc]
+
+ return " ".join(set(tokens))
+
+ def getFirstSynonym(self,word, thesaurus, default_return_first_Syn=False):
+ if not isinstance(word, str):
+ return word
+
+
+ word = word.lower()
+
+
+ # iterate over the thesaurus
+ for syn_block in thesaurus: # syn_block is a list of synonyms
+
+ for syn in syn_block:
+ syn = syn.lower()
+ if re.match(r'\A[\w-]+\Z', syn): # if syn is a single word
+ if word == syn:
+ return self.getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn)
+ else: # if it is a phrase
+ if word in syn:
+ return self.getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn)
+ return word # as a fallback, return the original word
+
+ def getHauptform(self,syn_block, word, default_return_first_Syn=False):
+
+ for syn in syn_block:
+ syn = syn.lower()
+
+ if "hauptform" in syn and len(syn.split(" ")) <= 2:
+ # nicht ausgeben, falls es in Klammern steht
+ for w in syn.split(" "):
+ if not re.match(r'\([^)]+\)', w):
+ return w
+
+ if default_return_first_Syn:
+ # falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht
+ for w in syn_block:
+ if not re.match(r'\([^)]+\)', w):
+ return w
+ return word # zur Not, das ursrpüngliche Wort zurückgeben
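+ # example (note added for clarity), assuming openthesaurus.csv contains a row like
+ # "Passwort (Hauptform);Kodewort;Schlüsselwort;..." :
+ # self.getFirstSynonym("Kodewort", self.thesaurus) returns "passwort",
+ # because "Passwort (Hauptform)" is the marked main form of that synonym block.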
+
+
+
+
+"""
+#################################################################################################################
+
+#todo: somehow does not work as intended: https://mathieularose.com/function-composition-in-python/
+def compose(self,*functions):
+ return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)
+
+pipeline = compose(functools.partial(cleaner.keepPOSandENT,lemmatize=True))#, cleaner.normalizeSynonyms)
+
+#################################################################################################################
+"""
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/openthesaurus.csv b/openthesaurus.csv
index ce336b3..e0c28df 100644
--- a/openthesaurus.csv
+++ b/openthesaurus.csv
@@ -1,4 +1,5 @@
-Kodewort;Schlüsselwort;Zugangscode;Passwort (Hauptform);Kennwort (Hauptform);Geheimcode;Losung;Codewort;Zugangswort;Losungswort;Parole
+Passwort (Hauptform);Kodewort;Schlüsselwort;Zugangscode;Kennwort (Hauptform);Geheimcode;Losung;Codewort;Zugangswort;Losungswort;Parole
+TH;Technische_Universität (Hauptform);Technische Hochschule;TU
Fission;Kernfission;Kernspaltung;Atomspaltung
Wiederaufnahme;Fortführung
davonfahren;abdüsen (ugs.);aufbrechen;abfliegen;abfahren;(von etwas) fortfahren;abreisen;wegfahren;wegfliegen
@@ -4207,7 +4208,6 @@ Akzise;Oktroi;Verbrauchsabgabe
Aufrührer;Tumultant
genügsam;bedürfnislos
zeigen;offenbaren;bekunden;kundtun
-TH;Technische Universität;Technische Hochschule;TU
Versprechen;Absichtserklärung (Nachrichtensprache);Zusicherung;Versicherung;Beteuerung
Beschaulichkeit;Stille
Auswärtiges Amt;Außenamt (ugs.);Außenministerium (ugs.);AA;Ministerium für Auswärtige Angelegenheiten
diff --git a/preprocessing.py b/preprocessing.py
index cfc29b5..7dda81c 100644
--- a/preprocessing.py
+++ b/preprocessing.py
@@ -1,24 +1,78 @@
# -*- coding: utf-8 -*-
import csv
-import random
+import functools
+import os.path
import re
-
+import subprocess
+import time
+import xml.etree.ElementTree as ET
+import sys
import spacy
import textacy
-import sys
+from scipy import *
+from textacy import Vectorizer
-import xml.etree.ElementTree as ET
-"""
-import keras
-import numpy as np
-from keras.layers import Dense, SimpleRNN, LSTM, TimeDistributed, Dropout
-from keras.models import Sequential
-import keras.backend as K
-"""
csv.field_size_limit(sys.maxsize)
+
+# Load the configuration file
+import configparser as ConfigParser
+config = ConfigParser.ConfigParser()
+with open("config.ini") as f:
+ config.read_file(f)
+
+
+
+path2xml = config.get("default","path2xml")
+
+PARSER = spacy.load(config.get("default","language"))
+corpus = textacy.Corpus(PARSER)
+
+thesauruspath = config.get("default","thesauruspath")
+THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))
+
+stop_words=list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) + config.get("preprocessing","custom_words").split(",")
+
+
+def compose(*functions):
+ def compose2(f, g):
+ return lambda x: f(g(x))
+ return functools.reduce(compose2, functions, lambda x: x)
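+# note added for clarity: compose(f, g, h)(doc) == f(g(h(doc))), i.e. the function
+# listed last is applied first; clean_in_content below relies on this ordering.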
+
+
+
+def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung'):
+ """
+ generates strings from XML
+ :param path2xml:
+ :param main_textfield:
+ :param cleaning_function:
+ :yields strings
+ """
+ tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
+ root = tree.getroot()
+
+ for ticket in root:
+ for field in ticket:
+ if field.tag == main_textfield:
+ yield field.text
+
+def generateMetadatafromTicketXML(path2xml, leave_out=['Beschreibung']):
+ tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
+ root = tree.getroot()
+
+ for ticket in root:
+ metadata = {}
+ for field in ticket:
+ if field.tag not in leave_out:
+
+ metadata[field.tag] = field.text
+
+ yield metadata
+
def printRandomDoc(textacyCorpus):
+ import random
print()
print("len(textacyCorpus) = %i" % len(textacyCorpus))
@@ -27,219 +81,113 @@ def printRandomDoc(textacyCorpus):
print()
-"""
-def getFirstSynonym(word, thesaurus_gen):
-
- word = word.lower()
- # TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python
-
-
- # durch den thesaurrus iterieren
- for syn_block in thesaurus_gen: # syn_block ist eine liste mit Synonymen
-
- # durch den synonymblock iterieren
- for syn in syn_block:
- syn = syn.lower().split(" ") if not re.match(r'\A[\w-]+\Z', syn) else syn # aus synonym mach liste (um evtl. sätze zu identifieziren)
-
- # falls das wort in dem synonym enthalten ist (also == einem Wort in der liste ist)
- if word in syn:
-
- # Hauptform suchen
- if "auptform" in syn:
- # nicht ausgeben, falls es in Klammern steht
- for w in syn:
- if not re.match(r'\([^)]+\)', w) and w is not None:
- return w
-
- # falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht
- if len(syn) == 1:
- w = syn[0]
- if not re.match(r'\([^)]+\)', w) and w is not None:
- return w
-
- return word # zur Not die eingabe ausgeben
-"""
-
-def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False, normalize_synonyms=False):
-
- # use preprocessing
- if customPreprocessing is not None:
- string = customPreprocessing(string)
-
-
-
- if custom_stopwords is not None:
- custom_stopwords = custom_stopwords
- else:
- custom_stopwords = []
-
- if custom_words is not None:
- custom_words = custom_words
- else:
- custom_words = []
-
- if custom_symbols is not None:
- custom_symbols = custom_symbols
- else:
- custom_symbols = []
-
-
- # custom stoplist
- # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
- stop_words = __import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS
-
- stoplist =list(stop_words) + custom_stopwords
- # List of symbols we don't care about either
- symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols
-
-
-
- # get rid of newlines
- string = string.strip().replace("\n", " ").replace("\r", " ")
-
- # replace twitter
- mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
- string = mentionFinder.sub("MENTION", string)
-
- # replace emails
- emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
- string = emailFinder.sub("EMAIL", string)
-
- # replace urls
- urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
- string = urlFinder.sub("URL", string)
-
- # replace HTML symbols
- string = string.replace("&", "and").replace(">", ">").replace("<", "<")
-
-
-
-
- # parse with spaCy
- spacy_doc = PARSER(string)
- tokens = []
-
- added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"]
- added_POS = ["NOUN", "NUM" ]#,"VERB","ADJ"] #IDEE NUM mit in den Corpus aufnehmen, aber fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013
-
- # append Tokens to a list
- for tok in spacy_doc:
- if tok.pos_ in added_POS:
- if lemmatize:
- tokens.append(tok.lemma_.lower().strip())
- else:
- tokens.append(tok.text.lower().strip())
-
- # add entities
- if tok.ent_type_ in added_entities:
- tokens.append(tok.text.lower())
-
-
-
- # remove stopwords
- tokens = [tok for tok in tokens if tok not in stoplist]
-
- # remove symbols
- tokens = [tok for tok in tokens if tok not in symbols]
-
- # remove custom_words
- tokens = [tok for tok in tokens if tok not in custom_words]
-
- # remove single characters
- tokens = [tok for tok in tokens if len(tok)>1]
-
- # remove large strings of whitespace
- while "" in tokens:
- tokens.remove("")
- while " " in tokens:
- tokens.remove(" ")
- while "\n" in tokens:
- tokens.remove("\n")
- while "\n\n" in tokens:
- tokens.remove("\n\n")
-
- #TODO abkürzungen auflösen (v.a. TU -> Technische Universität)
-
- if normalize_synonyms:
- tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens]
-
- return " ".join(tokens)
-
-def generateTextfromXML(path2xml, clean=True, textfield='Beschreibung'):
- import xml.etree.ElementTree as ET
-
- tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
-
- root = tree.getroot()
-
- for subject in root.iter(textfield):
- if clean:
- yield cleanText(subject.text)
- else:
- yield subject.text
-
-def generateMetadatafromXML(path2xml, keys=["Loesung","Kategorie","Zusammenfassung"]):
- import xml.etree.ElementTree as ET
-
- tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
-
- root = tree.getroot()
-
- metadata = dict.fromkeys(keys)
-
-
- for ticket in root.findall('ticket'):
- for key in metadata:
- metadata[key] = ticket.find(key).text
-
- yield metadata
-
-def generateFromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False):
- import xml.etree.ElementTree as ET
-
- tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
- root = tree.getroot()
-
- for ticket in root:
- metadata = {}
- text = "ERROR"
- for field in ticket:
- if field.tag == textfield:
- if clean:
- text = cleanText(field.text,normalize_synonyms=normalize_Synonyms,lemmatize=False)
- else:
- text = field.text
+def processDictstream(dictstream, funcdict, parser=PARSER):
+ for dic in dictstream:
+ result = {}
+ for key, value in dic.items():
+ if key in funcdict:
+ result[key] = funcdict[key](parser(value))
else:
- #todo hier auch cleanen?
- metadata[field.tag] = field.text
- yield text, metadata
+ result[key] = value
+ yield result
-def getFirstSynonym(word, thesaurus_gen):
+def processTextstream(textstream, func, parser=PARSER):
+ # input str-stream output str-stream
+ pipe = parser.pipe(textstream)
+
+ for doc in pipe:
+ yield func(doc)
+
+
+
+
+def keepOnlyPOS(pos_list, parser=PARSER):
+ return lambda doc : parser(" ".join([tok.text for tok in doc if tok.pos_ in pos_list]))
+
+def removeAllPOS(pos_list, parser=PARSER):
+ return lambda doc: parser(" ".join([tok.text for tok in doc if tok.pos_ not in pos_list]))
+
+def keepOnlyENT(ent_list,parser=PARSER):
+ return lambda doc: parser(" ".join([tok.text for tok in doc if tok.ent_type_ in ent_list]))
+
+def removeAllENT(ent_list, parser=PARSER):
+ return lambda doc: parser(" ".join([tok.text for tok in doc if tok.ent_type_ not in ent_list]))
+
+def keepUniqueTokens(parser=PARSER):
+ return lambda doc: parser(" ".join(set([tok.text for tok in doc])))
+
+def lemmatize(parser=PARSER):
+ return lambda doc: parser(" ".join([tok.lemma_ for tok in doc]))
+
+doc2String = lambda doc : doc.text
+
+
+mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
+emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
+urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
+
+def replaceURLs(replace_with="URL",parser=PARSER):
+ #return lambda doc: parser(textacy.preprocess.replace_urls(doc.text,replace_with=replace_with))
+ return lambda doc: parser(urlFinder.sub(replace_with,doc.text))
+
+def replaceEmails(replace_with="EMAIL",parser=PARSER):
+ #return lambda doc: parser(textacy.preprocess.replace_emails(doc.text,replace_with=replace_with))
+ return lambda doc : parser(emailFinder.sub(replace_with, doc.text))
+
+def replaceTwitterMentions(replace_with="TWITTER_MENTION",parser=PARSER):
+ return lambda doc : parser(mentionFinder.sub(replace_with, doc.text))
+
+def replaceNumbers(replace_with="NUMBER",parser=PARSER):
+ return lambda doc: parser(textacy.preprocess.replace_numbers(doc.text, replace_with=replace_with))
+
+def replacePhonenumbers(replace_with="PHONE",parser=PARSER):
+ return lambda doc: parser(textacy.preprocess.replace_phone_numbers(doc.text, replace_with=replace_with))
+
+
+
+def resolveAbbreviations(parser=PARSER):
+ pass #todo
+
+def removeWords(words, keep=None,parser=PARSER):
+ if hasattr(keep, '__iter__'):
+ for k in keep:
+ try:
+ words.remove(k)
+ except ValueError:
+ pass
+ return lambda doc : parser(" ".join([tok.text for tok in doc if tok.lower_ not in words]))
+
+
+
+def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER):
+ #return lambda doc : parser(" ".join([tok.lower_ for tok in doc]))
+ return lambda doc : parser(" ".join([getFirstSynonym(tok.lower_, THESAURUS, default_return_first_Syn=default_return_first_Syn) for tok in doc]))
+
+def getFirstSynonym(word, thesaurus, default_return_first_Syn=False):
+ if not isinstance(word, str):
+ return str(word)
word = word.lower()
-
     # iterate over the thesaurus
- for syn_block in thesaurus_gen: # syn_block ist eine liste mit Synonymen
+    for syn_block in thesaurus:  # each syn_block is a list of synonyms
for syn in syn_block:
syn = syn.lower()
             if re.match(r'\A[\w-]+\Z', syn):  # if syn is a single word
if word == syn:
- return getHauptform(syn_block, word)
+ return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
             else:  # if syn is a multi-word phrase
if word in syn:
- return getHauptform(syn_block, word)
- return word # zur Not, das ursrpüngliche Wort zurückgeben
-
+ return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
+    return str(word)  # as a fallback, return the original word
def getHauptform(syn_block, word, default_return_first_Syn=False):
-
for syn in syn_block:
syn = syn.lower()
if "hauptform" in syn and len(syn.split(" ")) <= 2:
- # nicht ausgeben, falls es in Klammern steht
+            # do not return it if it is inside parentheses  # todo: does this happen? maybe strip the parentheses instead
for w in syn.split(" "):
if not re.match(r'\([^)]+\)', w):
return w
@@ -251,62 +199,181 @@ def getHauptform(syn_block, word, default_return_first_Syn=False):
return w
     return word  # as a fallback, return the original word
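+
+# Illustrative sketch of the thesaurus lookup above. THESAURUS is read earlier in this
+# script from openthesaurus.csv, one list of synonyms per row, e.g.:
+#
+#   toy_thesaurus = [["rechner", "computer (hauptform)", "pc"]]
+#   getFirstSynonym("pc", toy_thesaurus)   # -> "computer"
+#
+# getHauptform() prefers the synonym marked "hauptform"; if none is marked and
+# default_return_first_Syn is True, the first synonym not written in parentheses is returned.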
+def label2ID(label):
+ return {
+ 'Neuanschluss' : 0,
+ 'LSF' : 1,
+ 'Video' : 2,
+ }.get(label,3)
-####################'####################'####################'####################'####################'##############
+def generate_labled_lines(textacyCorpus):
+ for doc in textacyCorpus:
+        # emit lines of the form "[topicID] tok1 tok2 tok3 ..." from the corpus
+ yield "[" + str(label2ID(doc.metadata["Kategorie"])) + "] " + doc.text
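+# Illustrative output of generate_labled_lines() (sketch): for a ticket whose metadata has
+# Kategorie == 'LSF' and whose cleaned text is "drucker seite", the emitted line is
+#   [1] drucker seite
+# which is the labeled-document format consumed by JGibbsLabeledLDA further below.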
-import de_core_news_md
-DATAPATH = "ticketSamples.xml"
-DATAPATH_thesaurus = "openthesaurus.csv"
-LANGUAGE = 'de'
####################'####################'####################'####################'####################'##############
-PARSER = de_core_news_md.load()#spacy.load(LANGUAGE)
-THESAURUS_list=list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";")) ## !!!!!! list wichtig, da sonst nicht die gleichen Synonyme zurückgegeben werden, weil der generator während der laufzeit pickt
+
+ents = config.get("preprocessing","ents2keep").split(",")
+
+
+clean_in_content = compose(  # note: the bottom-most function is applied first
+
+ doc2String,
+ keepUniqueTokens(),
+ #normalizeSynonyms(default_return_first_Syn=False),
+ lemmatize(),
+ replaceEmails(),
+ replaceURLs(),
+ replaceTwitterMentions(),
+ #removeAllENT(ents),
+ keepOnlyPOS(['NOUN'])
+)
+
+
+
+clean_in_meta = {
+ "Loesung":removeAllPOS(["SPACE"]),
+ "Zusammenfassung":removeAllPOS(["SPACE","PUNCT"])
+}
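+
+
+# For readability, the composed content pipeline above is equivalent to applying the steps
+# bottom-up by hand (a sketch; it assumes compose() applies its arguments right-to-left,
+# as noted in the comment on clean_in_content):
+def clean_in_content_explicit(doc):
+    doc = keepOnlyPOS(['NOUN'])(doc)
+    doc = replaceTwitterMentions()(doc)
+    doc = replaceURLs()(doc)
+    doc = replaceEmails()(doc)
+    doc = lemmatize()(doc)
+    doc = keepUniqueTokens()(doc)
+    return doc2String(doc)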
-## files to textacy-corpus
-textacyCorpus = textacy.Corpus(PARSER)
-
+## add texts and metadata to the textacy corpus
print("add texts to textacy-corpus...")
-#textacyCorpus.add_texts(texts=generateTextfromXML(DATAPATH), metadatas=generateMetadatafromXML(DATAPATH))
-for txt, dic in generateFromXML(DATAPATH,normalize_Synonyms=True,clean=True):
- textacyCorpus.add_text(txt,dic)
+corpus.add_texts(
+ processTextstream(generateMainTextfromTicketXML(path2xml), func=clean_in_content),
+ processDictstream(generateMetadatafromTicketXML(path2xml), funcdict=clean_in_meta)
+)
+
+printRandomDoc(corpus)
+
+# idea: build 3 different corpora
-for doc in textacyCorpus:
- print(doc.text)
+####################'####################'
-#print(textacyCorpus[2].text)
-#printRandomDoc(textacyCorpus)
-#print(textacyCorpus[len(textacyCorpus)-1].text)
+
+
+
+
+
+####################'####################' todo: move all of these parameters into the config
+
+ngrams = (1,2)
+
+min_df = 0
+max_df = 1.0
+no_below = 20
+no_above = 0.5
+
+topicModel = 'lda'
+# http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
+weighting = ('tf' if topicModel == 'lda' else 'tfidf')
+
+top_topic_words = 5
+top_document_labels_per_topic = 2
+
+n_topics = len(set(corpus[0].metadata.keys())) + 1  # +1 for a default topic
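+# Note: n_topics is derived here from the number of metadata fields of the first document.
+# If the intent is one topic per ticket category plus the default topic, a sketch of that
+# alternative would be:
+#
+#   n_topics = len({doc.metadata["Kategorie"] for doc in corpus}) + 1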
+
+
+
+
+
+
+####################'####################
+
+
+
+
+print("vectorize corpus...")
+vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
+
+terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in corpus)
+doc_term_matrix = vectorizer.fit_transform(terms_list)
+id2term = vectorizer.id_to_term  # maps term id (matrix column) -> term string
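+
+# quick sanity check (illustrative): the document-term matrix is (n_docs x n_terms) and
+# id2term maps a column index back to its term string.
+print("doc_term_matrix shape:", doc_term_matrix.shape)
+print("example (id, term) pairs:", list(id2term.items())[:5])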
+
+
+
+
+
+
+
+
+
+
+
+##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################
+
+# Initialize and train a topic model
+print("Initialize and train a topic model")
+model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
+model.fit(doc_term_matrix)
+
+#Transform the corpus and interpret our model:
+print("Transform the corpus and interpret our model")
+doc_topic_matrix = model.transform(doc_term_matrix)
+print()
+
+
+for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
+ print('topic', topic_idx, ':', ' '.join(top_terms))
+print()
+for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
+ print(topic_idx)
+ for j in top_docs:
+ print(corpus[j].metadata['Kategorie'])
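+
+# The per-document topic distribution can also be inspected directly (a sketch, using the
+# get_doc_topic_matrix API referenced in the comment near the weighting setting):
+#
+#   doc_topic = model.get_doc_topic_matrix(doc_term_matrix, normalize=True)
+#   print(doc_topic[0])   # topic weights of the first ticket, rows normalized to sum to 1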
+
+#####################################################################################################################
print()
print()
-#################### 1
-
-PARSER = de_core_news_md.load()#spacy.load(LANGUAGE)
-
-## files to textacy-corpus
-textacyCorpus = textacy.Corpus(PARSER)
-
-for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=False, clean=True):
- textacyCorpus.add_text(txt,dic)
-
-
-for doc in textacyCorpus:
- print(doc.text)
+
+##################### LLDA Topic Modeling via JGibbsLabledLDA ##############################################
+
+
+
+jgibbsLLDA_root = "java_LabledLDA/"
+filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
+
+
+# write the labeled LLDA training file
+textacy.fileio.write_file_lines(generate_labled_lines(corpus),filepath=filepath)
+
+
+# wait for file to exist
+while not os.path.exists(filepath):
+ time.sleep(1)
+
+print("start LLDA..")
+# run the JGibbsLabeledLDA estimation (jgibblda.LDA)
+FNULL = open(os.devnull, 'w')  # suppress output
+subprocess.call(["java",
+ "-cp", "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(jgibbsLLDA_root),
+ "jgibblda.LDA",
+ "-est",
+ "-dir", "{0}models/tickets".format(jgibbsLLDA_root),
+ "-dfile","tickets.gz",
+ "-ntopics", str(n_topics)], stdout = FNULL)
+
+# NOTE: the output files are hidden dot-files; they can be found under models/
+
+# print the top words per topic (.twords.gz)
+subprocess.call(["gzip",
+ "-dc",
+                 "{0}models/tickets/.twords.gz".format(jgibbsLLDA_root)])
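+
+# The same top-words file could also be read in-process instead of shelling out to gzip
+# (a sketch using only the standard library):
+#
+#   import gzip
+#   with gzip.open("{0}models/tickets/.twords.gz".format(jgibbsLLDA_root), "rt") as f:
+#       print(f.read())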
+#####################################################################################################################
print()
print()
@@ -330,6 +397,5 @@ print()
-
diff --git a/test.py b/test.py
deleted file mode 100644
index f4e8009..0000000
--- a/test.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# -*- coding: utf-8 -*-
-import re
-
-import spacy
-import textacy
-import xml.etree.ElementTree as ET
-
-
-DATAPATH_thesaurus = "openthesaurus.csv"
-
-def generateFromXML(path2xml, clean=True, textfield='Beschreibung'):
- import xml.etree.ElementTree as ET
-
- tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
- root = tree.getroot()
-
- for ticket in root:
- metadata = {}
- text = "ERROR"
- for field in ticket:
- if field.tag == textfield:
- if clean:
- text = (field.text)
- else:
- text = field.text
- else:
- metadata[field.tag] = field.text
- yield text, metadata
-
-
-
-def getFirstSynonym(word, thesaurus_gen):
-
- word = word.lower()
-
- # durch den thesaurrus iterieren
- for syn_block in thesaurus_gen: # syn_block ist eine liste mit Synonymen
-
- for syn in syn_block:
- syn = syn.lower()
- if re.match(r'\A[\w-]+\Z', syn): #falls syn einzelwort ist
- if word == syn:
- return getHauptform(syn_block,word)
- else: # falls es ein satz ist
- if word in syn:
- return getHauptform(syn_block,word)
- return word #zur Not, das ursrpüngliche Wort zurückgeben
-
-
-
-
-def getHauptform(syn_block,word,default_return_first_Syn=False):
-
- for syn in syn_block:
- syn = syn.lower()
-
- if "hauptform" in syn:
- # nicht ausgeben, falls es in Klammern steht
- for w in syn.split(" "):
- if not re.match(r'\([^)]+\)', w):
- return w
-
- if default_return_first_Syn:
- # falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht
- for w in syn_block:
- if not re.match(r'\([^)]+\)', w):
- return w
- return word # zur Not, das ursrpüngliche Wort zurückgeben
-
-THESAURUS_gen = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";")) # generator [[a,b,c,..],[a,b,c,..],...]
-
-strings = ["anmachen","Kernspaltung"]
-#strings = ["Kernspaltung","Kennwort"]
-
-for s in strings:
- print(getFirstSynonym(s,THESAURUS_gen))
-
-strings = ["Kennwort"]
-#THESAURUS_gen = textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";") # generator [[a,b,c,..],[a,b,c,..],...]
-
-for s in strings:
- print(getFirstSynonym(s, THESAURUS_gen))
-
-
-
-
-
-
-