+
+
+
+
+
+
\ No newline at end of file
diff --git a/java_LabledLDA/LICENSE b/java_LabledLDA/LICENSE
new file mode 100644
index 0000000..d159169
--- /dev/null
+++ b/java_LabledLDA/LICENSE
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/java_LabledLDA/LabledLDA.iml b/java_LabledLDA/LabledLDA.iml
new file mode 100644
index 0000000..39f2db5
--- /dev/null
+++ b/java_LabledLDA/LabledLDA.iml
@@ -0,0 +1,13 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/java_LabledLDA/README.md b/java_LabledLDA/README.md
new file mode 100644
index 0000000..a5d1262
--- /dev/null
+++ b/java_LabledLDA/README.md
@@ -0,0 +1,109 @@
+Labeled LDA in Java (based on JGibbLDA)
+=======================================
+
+This is a Java implementation of Labeled LDA based on the popular
+[JGibbLDA](http://jgibblda.sourceforge.net/) package. The code has been heavily
+refactored and a few additional options have been added. See sections below for
+more details.
+
+Data Format
+-----------
+
+The input data format is similar to the [JGibbLDA input data
+format](http://jgibblda.sourceforge.net/#_2.3._Input_Data_Format), with some
+minor cosmetic changes and additional support for document labels necessary for
+Labeled LDA. We first describe the (modified) input format for unlabeled
+documents, followed by the (new) input format for labeled documents.
+
+**Changed from JGibbLDA**: All input/output files must be Gzipped.
+
+### Unlabeled Documents
+
+Unlabeled documents have the following format:
+
+ document_1
+ document_2
+ ...
+ document_m
+
+where each document is a space-separated list of terms, i.e.,:
+
+ document_i = term_1 term_2 ... term_n
+
+**Changed from JGibbLDA**: The first line *should not* be an integer indicating
+the number of documents in the file. The original JGibbLDA code has been
+modified to identify the number of documents automatically.
+
+**Note**: Labeled and unlabeled documents may be mixed in the input file, thus
+you must ensure that unlabeled documents do not begin with a left square bracket
+(see Labeled Document input format below). One easy fix is to prepend a space
+character (' ') to each unlabeled document line.
+
+### Labeled Documents
+
+Labeled documents follow a format similar to unlabeled documents, but with the
+labels given at the beginning of each line and surrounded by square brackets,
+e.g.:
+
+ [label_1,1 label_1,2 ... label_1,l_1] document_1
+ [label_2,1 label_2,2 ... label_2,l_2] document_2
+ ...
+ [label_m,1 label_m,2 ... label_m,l_m] document_m
+
+where each label is an integer in the range [0, K-1], for K equal to the number
+of topics (-ntopics).
+
+**Note**: Labeled and unlabeled documents may be mixed in the input file. An
+unlabeled document is equivalent to labeling a document with every label in the
+range [0, K-1].
+
+Usage
+-----
+
+Please see the [JGibbLDA usage](http://jgibblda.sourceforge.net/#_2.2._Command_Line_&_Input_Parameter), noting the following changes:
+
+* All input files must be Gzipped. All output files are also Gzipped.
+
+* New options have been added:
+
+ **-nburnin &lt;int&gt;**: Discard this many initial iterations when taking samples.
+
+ **-samplinglag &lt;int&gt;**: The number of iterations between samples.
+
+ **-infseparately**: Inference is done separately for each document, as if
+ inference for each document was performed in isolation.
+
+ **-unlabeled**: Ignore document labels, i.e., treat every document as
+ unlabeled.
+
+* Some options have been deleted:
+
+ **-wordmap**: Filename is automatically built based on model path.
+
+Contact
+-------
+
+Please direct questions to [Myle Ott](mailto:myleott@gmail.com).
+
+License
+-------
+
+Following JGibbLDA, this code is licensed under the GPLv2. Please see the
+LICENSE file for the full license.
+
+Labeled LDA in Java
+Copyright (C) 2008-2013 Myle Ott (Labeled LDA), Xuan-Hieu Phan and Cam-Tu Nguyen (JGibbLDA)
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
diff --git a/java_LabledLDA/models/tickets/.others.gz b/java_LabledLDA/models/tickets/.others.gz
new file mode 100644
index 0000000..bd57ad4
Binary files /dev/null and b/java_LabledLDA/models/tickets/.others.gz differ
diff --git a/java_LabledLDA/models/tickets/.tassign.gz b/java_LabledLDA/models/tickets/.tassign.gz
new file mode 100644
index 0000000..07bf22c
Binary files /dev/null and b/java_LabledLDA/models/tickets/.tassign.gz differ
diff --git a/java_LabledLDA/models/tickets/.theta.gz b/java_LabledLDA/models/tickets/.theta.gz
new file mode 100644
index 0000000..8dc11ae
Binary files /dev/null and b/java_LabledLDA/models/tickets/.theta.gz differ
diff --git a/java_LabledLDA/models/tickets/.twords.gz b/java_LabledLDA/models/tickets/.twords.gz
new file mode 100644
index 0000000..608a547
Binary files /dev/null and b/java_LabledLDA/models/tickets/.twords.gz differ
diff --git a/java_LabledLDA/models/tickets/.wordmap.gz b/java_LabledLDA/models/tickets/.wordmap.gz
new file mode 100644
index 0000000..4df13f8
Binary files /dev/null and b/java_LabledLDA/models/tickets/.wordmap.gz differ
diff --git a/java_LabledLDA/models/tickets/tickets.gz b/java_LabledLDA/models/tickets/tickets.gz
new file mode 100644
index 0000000..8d3fe3a
Binary files /dev/null and b/java_LabledLDA/models/tickets/tickets.gz differ
diff --git a/java_LabledLDA/out/production/LabledLDA/jgibblda/Dictionary.class b/java_LabledLDA/out/production/LabledLDA/jgibblda/Dictionary.class
new file mode 100644
index 0000000..ba24d8f
Binary files /dev/null and b/java_LabledLDA/out/production/LabledLDA/jgibblda/Dictionary.class differ
diff --git a/java_LabledLDA/out/production/LabledLDA/jgibblda/Document.class b/java_LabledLDA/out/production/LabledLDA/jgibblda/Document.class
new file mode 100644
index 0000000..defecd7
Binary files /dev/null and b/java_LabledLDA/out/production/LabledLDA/jgibblda/Document.class differ
diff --git a/java_LabledLDA/out/production/LabledLDA/jgibblda/Estimator.class b/java_LabledLDA/out/production/LabledLDA/jgibblda/Estimator.class
new file mode 100644
index 0000000..efa7ee8
Binary files /dev/null and b/java_LabledLDA/out/production/LabledLDA/jgibblda/Estimator.class differ
diff --git a/java_LabledLDA/out/production/LabledLDA/jgibblda/Inferencer.class b/java_LabledLDA/out/production/LabledLDA/jgibblda/Inferencer.class
new file mode 100644
index 0000000..9ec8304
Binary files /dev/null and b/java_LabledLDA/out/production/LabledLDA/jgibblda/Inferencer.class differ
diff --git a/java_LabledLDA/out/production/LabledLDA/jgibblda/LDA.class b/java_LabledLDA/out/production/LabledLDA/jgibblda/LDA.class
new file mode 100644
index 0000000..3e6ed60
Binary files /dev/null and b/java_LabledLDA/out/production/LabledLDA/jgibblda/LDA.class differ
diff --git a/java_LabledLDA/out/production/LabledLDA/jgibblda/LDACmdOption.class b/java_LabledLDA/out/production/LabledLDA/jgibblda/LDACmdOption.class
new file mode 100644
index 0000000..b0f7878
Binary files /dev/null and b/java_LabledLDA/out/production/LabledLDA/jgibblda/LDACmdOption.class differ
diff --git a/java_LabledLDA/out/production/LabledLDA/jgibblda/LDADataset.class b/java_LabledLDA/out/production/LabledLDA/jgibblda/LDADataset.class
new file mode 100644
index 0000000..f9dd6cf
Binary files /dev/null and b/java_LabledLDA/out/production/LabledLDA/jgibblda/LDADataset.class differ
diff --git a/java_LabledLDA/out/production/LabledLDA/jgibblda/Model.class b/java_LabledLDA/out/production/LabledLDA/jgibblda/Model.class
new file mode 100644
index 0000000..1d97a68
Binary files /dev/null and b/java_LabledLDA/out/production/LabledLDA/jgibblda/Model.class differ
diff --git a/java_LabledLDA/out/production/LabledLDA/jgibblda/Pair.class b/java_LabledLDA/out/production/LabledLDA/jgibblda/Pair.class
new file mode 100644
index 0000000..df577a4
Binary files /dev/null and b/java_LabledLDA/out/production/LabledLDA/jgibblda/Pair.class differ
diff --git a/java_LabledLDA/src/jgibblda/Dictionary.java b/java_LabledLDA/src/jgibblda/Dictionary.java
new file mode 100644
index 0000000..cbff86b
--- /dev/null
+++ b/java_LabledLDA/src/jgibblda/Dictionary.java
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2007 by
+ *
+ * Xuan-Hieu Phan
+ * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
+ * Graduate School of Information Sciences
+ * Tohoku University
+ *
+ * Cam-Tu Nguyen
+ * ncamtu@gmail.com
+ * College of Technology
+ * Vietnam National University, Hanoi
+ *
+ * JGibbsLDA is a free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * JGibbsLDA is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with JGibbsLDA; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+package jgibblda;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+
+import gnu.trove.map.hash.TObjectIntHashMap;
+import gnu.trove.map.hash.TIntObjectHashMap;
+
+/** Bidirectional word/id vocabulary, persisted as a gzipped UTF-8 word list (line number = word id). */
+public class Dictionary {
+    /** Word to id lookup. */
+    public TObjectIntHashMap<String> word2id;
+    /** Id to word lookup. */
+    public TIntObjectHashMap<String> id2word;
+
+    /** Creates an empty dictionary. */
+    public Dictionary(){
+        // explicit type arguments: getWord() relies on id2word yielding String values
+        word2id = new TObjectIntHashMap<String>();
+        id2word = new TIntObjectHashMap<String>();
+    }
+
+    /** Returns the word stored under {@code id}, or {@code null} if the id is unknown. */
+    public String getWord(int id){
+        return id2word.get(id);
+    }
+
+    /** Returns the id of {@code word}. An unknown word yields the map's no-entry
+     *  value (0 by default in Trove), which is also a valid id, so check
+     *  {@link #contains(String)} first when the word may be missing. */
+    public int getID(String word){
+        return word2id.get(word);
+    }
+
+    /** Checks whether this dictionary contains the given word. */
+    public boolean contains(String word){
+        return word2id.containsKey(word);
+    }
+
+    /** Checks whether this dictionary contains the given id. */
+    public boolean contains(int id){
+        return id2word.containsKey(id);
+    }
+
+    /** Adds {@code word} unless already present and returns its (new or existing) id. */
+    public int addWord(String word){
+        if (contains(word)) {
+            return getID(word);
+        }
+        // the next free id is the current vocabulary size
+        int id = word2id.size();
+        word2id.put(word, id);
+        id2word.put(id, word);
+        return id;
+    }
+
+    /** Reads a gzipped UTF-8 word list (one word per line); returns {@code false} on any error. */
+    public boolean readWordMap(String wordMapFile)
+    {
+        FileInputStream fileIn = null;
+        try {
+            fileIn = new FileInputStream(wordMapFile);
+            BufferedReader reader = new BufferedReader(new InputStreamReader(
+                        new GZIPInputStream(fileIn), "UTF-8"));
+            String line;
+
+            for (int i = 0; (line = reader.readLine()) != null; i++) {
+                String word = line.trim();
+                id2word.put(i, word);
+                word2id.put(word, i);
+            }
+
+            reader.close();
+            return true;
+        } catch (Exception e) {
+            System.out.println("Error while reading dictionary:" + e.getMessage());
+            e.printStackTrace();
+            return false;
+        } finally {
+            closeQuietly(fileIn); // releases the file even when wrapping or reading failed
+        }
+    }
+
+    /** Writes the words for ids 0..size-1 as a gzipped UTF-8 list; returns {@code false} on any error. */
+    public boolean writeWordMap(String wordMapFile)
+    {
+        FileOutputStream fileOut = null;
+        try {
+            fileOut = new FileOutputStream(wordMapFile);
+            BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
+                        new GZIPOutputStream(fileOut), "UTF-8"));
+
+            for (int i = 0; i < id2word.size(); i++) {
+                writer.write(id2word.get(i) + "\n");
+            }
+
+            writer.close(); // inside try: a failed flush must be reported as failure
+            return true;
+        } catch (Exception e) {
+            System.out.println("Error while writing word map " + e.getMessage());
+            e.printStackTrace();
+            return false;
+        } finally {
+            closeQuietly(fileOut);
+        }
+    }
+
+    /** Closes {@code stream} if non-null, ignoring errors; used only for cleanup. */
+    private static void closeQuietly(java.io.Closeable stream) {
+        if (stream == null) return;
+        try {
+            stream.close();
+        } catch (java.io.IOException ignored) {
+            // best effort: the caller's result has already been decided
+        }
+    }
+}
diff --git a/java_LabledLDA/src/jgibblda/Document.java b/java_LabledLDA/src/jgibblda/Document.java
new file mode 100644
index 0000000..2bf8ceb
--- /dev/null
+++ b/java_LabledLDA/src/jgibblda/Document.java
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2007 by
+ *
+ * Xuan-Hieu Phan
+ * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
+ * Graduate School of Information Sciences
+ * Tohoku University
+ *
+ * Cam-Tu Nguyen
+ * ncamtu@gmail.com
+ * College of Technology
+ * Vietnam National University, Hanoi
+ *
+ * JGibbsLDA is a free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * JGibbsLDA is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with JGibbsLDA; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+package jgibblda;
+
+import gnu.trove.list.array.TIntArrayList;
+
+/** One document: its integer-encoded words, the raw text and optional labels. */
+public class Document {
+
+    // ---- instance state ----
+    public int[] words;
+    public String rawStr = "";
+    public int length;
+    public int[] labels = null;
+
+    /** Creates a document from a list of word ids; raw text stays empty and labels stay null. */
+    public Document(TIntArrayList doc){
+        // toArray() yields a fresh array holding exactly the list's elements
+        this.words = doc.toArray();
+        this.length = this.words.length;
+    }
+
+    /** Creates a document from word ids plus the raw string it was built from. */
+    public Document(TIntArrayList doc, String rawStr) {
+        this(doc);
+        this.rawStr = rawStr;
+    }
+
+    /** Creates a document with word ids, raw string and labels (labels may be {@code null}). */
+    public Document(TIntArrayList doc, String rawStr, TIntArrayList tlabels) {
+        this(doc, rawStr);
+        // the field already defaults to null, so only a present label list is copied
+        if (tlabels != null) {
+            this.labels = tlabels.toArray();
+        }
+    }
+}
diff --git a/java_LabledLDA/src/jgibblda/Estimator.java b/java_LabledLDA/src/jgibblda/Estimator.java
new file mode 100644
index 0000000..c7cd757
--- /dev/null
+++ b/java_LabledLDA/src/jgibblda/Estimator.java
@@ -0,0 +1,145 @@
+/*
+ * Copyright (C) 2007 by
+ *
+ * Xuan-Hieu Phan
+ * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
+ * Graduate School of Information Sciences
+ * Tohoku University
+ *
+ * Cam-Tu Nguyen
+ * ncamtu@gmail.com
+ * College of Technology
+ * Vietnam National University, Hanoi
+ *
+ * JGibbsLDA is a free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * JGibbsLDA is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with JGibbsLDA; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+package jgibblda;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+
+public class Estimator {
+    // output model
+    protected Model trnModel;
+    LDACmdOption option;
+
+    /**
+     * Creates the model described by {@code option} and initialises it:
+     * randomly for {@code -est}, from the saved model files for {@code -estc}.
+     * @throws IOException if the data cannot be read or the saved model cannot be loaded
+     */
+    public Estimator(LDACmdOption option) throws FileNotFoundException, IOException {
+        this.option = option;
+        trnModel = new Model(option);
+        // Model.init reports failure only through its return value, so check it here
+        boolean ready = true;
+        if (option.est) {
+            ready = trnModel.init(true);
+        } else if (option.estc) {
+            ready = trnModel.init(false);
+        }
+        if (!ready) {
+            throw new IOException("Could not initialise model '" + option.modelName + "'");
+        }
+    }
+
+    /** Runs {@code niters} more Gibbs sweeps from the model's last iteration, then saves it. */
+    public void estimate() {
+        System.out.println("Sampling " + trnModel.niters + " iterations!");
+        System.out.print("Iteration");
+        int lastIter = trnModel.liter + trnModel.niters;
+        for (++trnModel.liter; trnModel.liter <= lastIter; trnModel.liter++) {
+            System.out.format("%6d", trnModel.liter);
+
+            // resample z_i = z[m][n] for every word of every document
+            for (int m = 0; m < trnModel.M; m++) {
+                for (int n = 0; n < trnModel.data.docs.get(m).length; n++) {
+                    trnModel.z[m].set(n, sampling(m, n));
+                }
+            }
+
+            // always on the final sweep; otherwise after burn-in, every samplingLag sweeps
+            if (trnModel.liter == lastIter ||
+                    (trnModel.liter > trnModel.nburnin && trnModel.liter % trnModel.samplingLag == 0)) {
+                trnModel.updateParams();
+            }
+
+            System.out.print("\b\b\b\b\b\b");
+        }
+        trnModel.liter--;
+
+        System.out.println("\nSaving the final model!");
+        trnModel.saveModel();
+    }
+
+    /**
+     * Resamples the topic of one word from p(z_i | z_-i, w).
+     *
+     * @param m document number
+     * @param n word number
+     * @return topic id
+     */
+    public int sampling(int m, int n) {
+        // remove z_i from the count variables
+        int topic = trnModel.z[m].get(n);
+        int w = trnModel.data.docs.get(m).words[n];
+        trnModel.nw[w][topic] -= 1;
+        trnModel.nd[m][topic] -= 1;
+        trnModel.nwsum[topic] -= 1;
+        trnModel.ndsum[m] -= 1;
+
+        double Vbeta = trnModel.V * trnModel.beta;
+
+        // Labels restrict the candidate topics. An empty label array would leave
+        // nothing to sample from, so it is treated like an unlabeled document.
+        int[] labels = trnModel.data.docs.get(m).labels;
+        if (labels != null && labels.length == 0) {
+            labels = null;
+        }
+        int K_m = (labels == null) ? trnModel.K : labels.length;
+
+        // cumulative, unnormalised multinomial over the candidate topics
+        double[] p = trnModel.p;
+        for (int k = 0; k < K_m; k++) {
+            topic = (labels == null) ? k : labels[k];
+            p[k] = (trnModel.nd[m][topic] + trnModel.alpha) *
+                    (trnModel.nw[w][topic] + trnModel.beta) /
+                    (trnModel.nwsum[topic] + Vbeta);
+            if (k > 0) p[k] += p[k - 1];
+        }
+
+        // scaled sample because of unnormalised p[]
+        double u = Math.random() * p[K_m - 1];
+
+        // stop at the last candidate so rounding in u cannot push the index past the end
+        for (topic = 0; topic < K_m - 1; topic++) {
+            if (p[topic] > u) break;
+        }
+
+        // map [0, K_m - 1] topic to [0, K - 1] topic according to labels
+        if (labels != null) {
+            topic = labels[topic];
+        }
+
+        // add newly estimated z_i to count variables
+        trnModel.nw[w][topic] += 1;
+        trnModel.nd[m][topic] += 1;
+        trnModel.nwsum[topic] += 1;
+        trnModel.ndsum[m] += 1;
+
+        return topic;
+    }
+}
diff --git a/java_LabledLDA/src/jgibblda/Inferencer.java b/java_LabledLDA/src/jgibblda/Inferencer.java
new file mode 100644
index 0000000..159557b
--- /dev/null
+++ b/java_LabledLDA/src/jgibblda/Inferencer.java
@@ -0,0 +1,178 @@
+/*
+ * Copyright (C) 2007 by
+ *
+ * Xuan-Hieu Phan
+ * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
+ * Graduate School of Information Sciences
+ * Tohoku University
+ *
+ * Cam-Tu Nguyen
+ * ncamtu@gmail.com
+ * College of Technology
+ * Vietnam National University, Hanoi
+ *
+ * JGibbsLDA is a free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * JGibbsLDA is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with JGibbsLDA; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+package jgibblda;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+
+public class Inferencer {
+    // Train model
+    public Model trnModel;
+    public Dictionary globalDict;
+    private LDACmdOption option;
+
+    private Model newModel;
+
+    /**
+     * Loads the previously estimated model named by {@code option}.
+     *
+     * @throws IOException if the data cannot be read or the saved model cannot be loaded
+     */
+    public Inferencer(LDACmdOption option) throws FileNotFoundException, IOException {
+        this.option = option;
+
+        trnModel = new Model(option);
+        // Model.init reports a failed load only through its return value; without
+        // this check the count arrays stay null and sampling fails much later.
+        if (!trnModel.init(false)) {
+            throw new IOException("Could not load trained model '" + option.modelName + "'");
+        }
+
+        globalDict = trnModel.data.localDict;
+    }
+
+    /**
+     * Samples topics for the dataset named by the options on top of the trained
+     * model's counts, then saves the outputs using the data file name as prefix.
+     *
+     * @return the model built for the new data
+     */
+    public Model inference() throws FileNotFoundException, IOException {
+        newModel = new Model(option, trnModel);
+        newModel.init(true);
+        newModel.initInf();
+
+        System.out.println("Sampling " + newModel.niters + " iterations for inference!");
+        System.out.print("Iteration");
+        for (newModel.liter = 1; newModel.liter <= newModel.niters; newModel.liter++) {
+            System.out.format("%6d", newModel.liter);
+
+            // for all newz_i: sample from p(z_i|z_-i,w)
+            for (int m = 0; m < newModel.M; ++m) {
+                for (int n = 0; n < newModel.data.docs.get(m).length; n++) {
+                    newModel.z[m].set(n, infSampling(m, n));
+                }
+            }
+
+            boolean lastSweep = newModel.liter == newModel.niters;
+            boolean pastBurnIn = newModel.liter > newModel.nburnin;
+            if (lastSweep || (pastBurnIn && newModel.liter % newModel.samplingLag == 0)) {
+                newModel.updateParams(trnModel);
+            }
+
+            System.out.print("\b\b\b\b\b\b");
+        }
+        newModel.liter--;
+
+        System.out.println("\nSaving the inference outputs!");
+        String outputPrefix = newModel.dfile;
+        if (outputPrefix.endsWith(".gz")) {
+            outputPrefix = outputPrefix.substring(0, outputPrefix.length() - 3);
+        }
+        newModel.saveModel(outputPrefix + ".");
+
+        return newModel;
+    }
+
+    /**
+     * Resamples the topic of one word of the new data, adding the trained
+     * model's counts to the counts gathered during inference.
+     *
+     * @param m document number
+     * @param n word number
+     * @return topic id
+     */
+    protected int infSampling(int m, int n) {
+        // remove z_i from the count variables
+        int topic = newModel.z[m].get(n);
+        int _w = newModel.data.docs.get(m).words[n];
+        int w = newModel.data.lid2gid.get(_w);
+
+        newModel.nw[_w][topic] -= 1;
+        newModel.nd[m][topic] -= 1;
+        newModel.nwsum[topic] -= 1;
+        newModel.ndsum[m] -= 1;
+
+        int[] nw_inf_m__w = null;
+        if (option.infSeparately) {
+            nw_inf_m__w = newModel.nw_inf.get(m).get(_w);
+            nw_inf_m__w[topic] -= 1;
+            newModel.nwsum_inf[m][topic] -= 1;
+        }
+
+        double Vbeta = trnModel.V * newModel.beta;
+
+        // Labels restrict the candidate topics. An empty label array would leave
+        // nothing to sample from, so it is treated like an unlabeled document.
+        int[] labels = newModel.data.docs.get(m).labels;
+        if (labels != null && labels.length == 0) {
+            labels = null;
+        }
+        int K_m = (labels == null) ? newModel.K : labels.length;
+
+        // cumulative, unnormalised multinomial over the candidate topics
+        double[] p = newModel.p;
+        for (int k = 0; k < K_m; k++) {
+            topic = (labels == null) ? k : labels[k];
+            // counts of the new data: per document when inferring separately
+            int nw_k = option.infSeparately ? nw_inf_m__w[topic] : newModel.nw[_w][topic];
+            int nwsum_k = option.infSeparately ? newModel.nwsum_inf[m][topic] : newModel.nwsum[topic];
+            p[k] = (newModel.nd[m][topic] + newModel.alpha) *
+                    (trnModel.nw[w][topic] + nw_k + newModel.beta) /
+                    (trnModel.nwsum[topic] + nwsum_k + Vbeta);
+            if (k > 0) p[k] += p[k - 1];
+        }
+
+        // scaled sample because of unnormalised p[]
+        double u = Math.random() * p[K_m - 1];
+
+        // stop at the last candidate so rounding in u cannot push the index past the end
+        for (topic = 0; topic < K_m - 1; topic++) {
+            if (p[topic] > u) break;
+        }
+
+        // map [0, K_m - 1] topic to [0, K - 1] topic according to labels
+        if (labels != null) {
+            topic = labels[topic];
+        }
+
+        // add newly estimated z_i to count variables
+        newModel.nw[_w][topic] += 1;
+        newModel.nd[m][topic] += 1;
+        newModel.nwsum[topic] += 1;
+        newModel.ndsum[m] += 1;
+
+        if (option.infSeparately) {
+            nw_inf_m__w[topic] += 1;
+            newModel.nwsum_inf[m][topic] += 1;
+        }
+
+        return topic;
+    }
+}
diff --git a/java_LabledLDA/src/jgibblda/LDA.java b/java_LabledLDA/src/jgibblda/LDA.java
new file mode 100644
index 0000000..87e24e9
--- /dev/null
+++ b/java_LabledLDA/src/jgibblda/LDA.java
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2007 by
+ *
+ * Xuan-Hieu Phan
+ * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
+ * Graduate School of Information Sciences
+ * Tohoku University
+ *
+ * Cam-Tu Nguyen
+ * ncamtu@gmail.com
+ * College of Technology
+ * Vietnam National University, Hanoi
+ *
+ * JGibbsLDA is a free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * JGibbsLDA is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with JGibbsLDA; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+package jgibblda;
+
+import java.io.FileNotFoundException;
+
+import org.kohsuke.args4j.*;
+
+
+public class LDA
+{
+ public static void main(String args[])
+ {
+ LDACmdOption option = new LDACmdOption();
+ CmdLineParser parser = new CmdLineParser(option);
+
+ try {
+ if (args.length == 0){
+ showHelp(parser);
+ return;
+ }
+
+ parser.parseArgument(args);
+
+ if (option.est || option.estc){
+ Estimator estimator = new Estimator(option);
+ estimator.estimate();
+ }
+ else if (option.inf){
+ Inferencer inferencer = new Inferencer(option);
+ Model newModel = inferencer.inference();
+ }
+ } catch (CmdLineException cle){
+ System.out.println("Command line error: " + cle.getMessage());
+ showHelp(parser);
+ return;
+ } catch (FileNotFoundException e) {
+ e.printStackTrace();
+ return;
+ } catch (Exception e){
+ System.out.println("Error in main: " + e.getMessage());
+ e.printStackTrace();
+ return;
+ }
+ }
+
+ public static void showHelp(CmdLineParser parser){
+ System.out.println("LDA [options ...] [arguments...]");
+ parser.printUsage(System.out);
+ }
+
+}
diff --git a/java_LabledLDA/src/jgibblda/LDACmdOption.java b/java_LabledLDA/src/jgibblda/LDACmdOption.java
new file mode 100644
index 0000000..e4ab0b4
--- /dev/null
+++ b/java_LabledLDA/src/jgibblda/LDACmdOption.java
@@ -0,0 +1,51 @@
+package jgibblda;
+
+import org.kohsuke.args4j.*;
+
+public class LDACmdOption {
+ // Option holder filled in by the CmdLineParser created in LDA.main; the initialisers below are the defaults.
+ @Option(name="-est", usage="Specify whether we want to estimate model from scratch")
+ public boolean est = false; // Estimator then initialises the model randomly (Model.init(true))
+
+ @Option(name="-estc", usage="Specify whether we want to continue the last estimation")
+ public boolean estc = false; // Estimator then loads the saved model first (Model.init(false))
+
+ @Option(name="-inf", usage="Specify whether we want to do inference")
+ public boolean inf = true; // true by default, so LDA.main runs inference whenever -est and -estc are both absent
+
+ @Option(name="-infseparately", usage="Do inference for each document separately")
+ public boolean infSeparately = false; // Inferencer then uses the per-document counts Model.nw_inf / nwsum_inf
+
+ @Option(name="-unlabeled", usage="Ignore document labels")
+ public boolean unlabeled = false; // passed to LDADataset.addDoc, which then strips the label list without parsing it
+
+ @Option(name="-dir", usage="Specify directory")
+ public String dir = ""; // Model prefixes the data file and every model file with this directory
+
+ @Option(name="-dfile", usage="Specify data file (*.gz)")
+ public String dfile = ""; // read by LDADataset.readDataSet through GZIPInputStream as UTF-8, one document per line
+
+ @Option(name="-model", usage="Specify the model name")
+ public String modelName = ""; // base name of the model files; Model appends suffixes such as ".theta.gz"
+
+ @Option(name="-alpha", usage="Specify alpha")
+ public double alpha = -1; // negative means "not given": Model uses 50/K, or the trained model's alpha when one is supplied
+
+ @Option(name="-beta", usage="Specify beta")
+ public double beta = -1; // negative means "not given": Model keeps its 0.01 default, or the trained model's beta
+
+ @Option(name="-ntopics", usage="Specify the number of topics")
+ public int K = 100; // replaced by the trained model's K when Model is built with a trained model
+
+ @Option(name="-niters", usage="Specify the number of iterations")
+ public int niters = 1000; // number of sampling sweeps run by Estimator.estimate / Inferencer.inference
+
+ @Option(name="-nburnin", usage="Specify the number of burn-in iterations")
+ public int nburnin = 500; // before this iteration parameters are only updated on the final sweep
+
+ @Option(name="-samplinglag", usage="Specify the sampling lag")
+ public int samplingLag = 5; // after burn-in, parameters are updated when iteration % samplingLag == 0
+
+ @Option(name="-twords", usage="Specify the number of most likely words to be printed for each topic")
+ public int twords = 100; // Model.saveModel skips the twords file when this is not positive
+}
diff --git a/java_LabledLDA/src/jgibblda/LDADataset.java b/java_LabledLDA/src/jgibblda/LDADataset.java
new file mode 100644
index 0000000..2ff89af
--- /dev/null
+++ b/java_LabledLDA/src/jgibblda/LDADataset.java
@@ -0,0 +1,179 @@
+/*
+ * Copyright (C) 2007 by
+ *
+ * Xuan-Hieu Phan
+ * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
+ * Graduate School of Information Sciences
+ * Tohoku University
+ *
+ * Cam-Tu Nguyen
+ * ncamtu@gmail.com
+ * College of Technology
+ * Vietnam National University, Hanoi
+ *
+ * JGibbsLDA is a free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * JGibbsLDA is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with JGibbsLDA; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+package jgibblda;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.zip.GZIPInputStream;
+
+import gnu.trove.list.array.TIntArrayList;
+import gnu.trove.map.hash.TIntIntHashMap;
+import gnu.trove.set.hash.TIntHashSet;
+
+public class LDADataset {
+ //---------------------------------------------------------------
+ // Instance Variables
+ //---------------------------------------------------------------
+
+ public Dictionary localDict = new Dictionary(); // local dictionary
+ public ArrayList docs = new ArrayList(); // a list of documents
+ public int M = 0; // number of documents
+ public int V = 0; // number of words
+
+ // map from local coordinates (id) to global ones
+ // null if the global dictionary is not set
+ public TIntIntHashMap lid2gid = null;
+
+ //link to a global dictionary (optional), null for train data, not null for test data
+ public Dictionary globalDict = null;
+
+ //-------------------------------------------------------------
+ //Public Instance Methods
+ //-------------------------------------------------------------
+ public void setM(int M)
+ {
+ this.M = M;
+ }
+
+ public void setDictionary(Dictionary globalDict)
+ {
+ lid2gid = new TIntIntHashMap();
+ this.globalDict = globalDict;
+ }
+
+ /**
+ * set the document at the index idx if idx is greater than 0 and less than M
+ * @param doc document to be set
+ * @param idx index in the document array
+ */
+ public void setDoc(Document doc, int idx){
+ if (idx < docs.size()) {
+ docs.set(idx, doc);
+ } else {
+ docs.add(idx, doc);
+ }
+ }
+
+ /**
+ * add a new document
+ * @param str string contains doc
+ */
+ public void addDoc(String str, boolean unlabeled)
+ {
+ // read document labels (if provided)
+ TIntArrayList labels = null;
+ if (str.startsWith("[")) {
+ String[] labelsBoundary = str.
+ substring(1). // remove initial '['
+ split("]", 2); // separate labels and str between ']'
+ String[] labelStrs = labelsBoundary[0].trim().split("[ \\t]");
+ str = labelsBoundary[1].trim();
+
+ // parse labels (unless we're ignoring the labels)
+ if (!unlabeled) {
+ // store labels in a HashSet to ensure uniqueness
+ TIntHashSet label_set = new TIntHashSet();
+ for (String labelStr : labelStrs) {
+ try {
+ label_set.add(Integer.parseInt(labelStr.trim()));
+ } catch (NumberFormatException nfe) {
+ System.err.println("Unknown document label ( " + labelStr + " ) for document " + docs.size() + ".");
+ }
+ }
+ labels = new TIntArrayList(label_set);
+ labels.sort();
+ }
+ }
+
+ String[] words = str.split("[ \\t\\n]");
+ TIntArrayList ids = new TIntArrayList();
+ for (String word : words){
+ if (word.trim().equals("")) {
+ continue;
+ }
+
+ int _id = localDict.word2id.size();
+
+ if (localDict.contains(word))
+ _id = localDict.getID(word);
+
+ if (globalDict != null) {
+ //get the global id
+ if (globalDict.contains(word)) {
+ localDict.addWord(word);
+
+ lid2gid.put(_id, globalDict.getID(word));
+ ids.add(_id);
+ }
+ }
+ else {
+ localDict.addWord(word);
+ ids.add(_id);
+ }
+ }
+
+ setDoc(new Document(ids, str, labels), docs.size());
+
+ V = localDict.word2id.size();
+ }
+
+ //---------------------------------------------------------------
+ // I/O methods
+ //---------------------------------------------------------------
+
+ /**
+ * read a dataset from a file
+ * @return true if success and false otherwise
+ */
+ public boolean readDataSet(String filename, boolean unlabeled) throws FileNotFoundException, IOException
+ {
+ BufferedReader reader = new BufferedReader(new InputStreamReader(
+ new GZIPInputStream(
+ new FileInputStream(filename)), "UTF-8"));
+ try {
+ String line;
+ while ((line = reader.readLine()) != null) {
+ addDoc(line, unlabeled);
+ }
+ setM(docs.size());
+
+ // debug output
+ System.out.println("Dataset loaded:");
+ System.out.println("\tM:" + M);
+ System.out.println("\tV:" + V);
+
+ return true;
+ } finally {
+ reader.close();
+ }
+ }
+}
diff --git a/java_LabledLDA/src/jgibblda/Model.java b/java_LabledLDA/src/jgibblda/Model.java
new file mode 100644
index 0000000..22f4c09
--- /dev/null
+++ b/java_LabledLDA/src/jgibblda/Model.java
@@ -0,0 +1,669 @@
+/*
+ * Copyright (C) 2007 by
+ *
+ * Xuan-Hieu Phan
+ * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
+ * Graduate School of Information Sciences
+ * Tohoku University
+ *
+ * Cam-Tu Nguyen
+ * ncamtu@gmail.com
+ * College of Technology
+ * Vietnam National University, Hanoi
+ *
+ * JGibbsLDA is a free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * JGibbsLDA is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with JGibbsLDA; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+package jgibblda;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.StringTokenizer;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+
+import gnu.trove.list.array.TIntArrayList;
+import gnu.trove.map.hash.TIntObjectHashMap;
+
+public class Model {
+
+ //---------------------------------------------------------------
+ // Class Variables
+ //---------------------------------------------------------------
+
+ public static String tassignSuffix = ".tassign.gz"; // suffix for topic assignment file
+ public static String thetaSuffix = ".theta.gz"; // suffix for theta (topic - document distribution) file
+ public static String phiSuffix = ".phi.gz"; // suffix for phi file (topic - word distribution) file
+ public static String othersSuffix = ".others.gz"; // suffix for file containing other parameters
+ public static String twordsSuffix = ".twords.gz"; // suffix for file containing words-per-topics
+ public static String wordMapSuffix = ".wordmap.gz"; // suffix for file containing word to id map
+
+ //---------------------------------------------------------------
+ // Model Parameters and Variables
+ //---------------------------------------------------------------
+
+
+ public String dir = "./";
+ public String dfile = "trndocs.dat";
+ public boolean unlabeled = false;
+ public String modelName = "model";
+ public LDADataset data; // link to a dataset
+
+ public int M = 0; // dataset size (i.e., number of docs)
+ public int V = 0; // vocabulary size
+ public int K = 100; // number of topics
+ public double alpha; // LDA hyperparameter: smoothing added to document-topic counts (see updateTheta)
+ public double beta = 0.01; // LDA hyperparameter: smoothing added to topic-word counts (see updatePhi)
+ public int niters = 1000; // number of Gibbs sampling iteration
+ public int nburnin = 500; // number of Gibbs sampling burn-in iterations
+ public int samplingLag = 5;// Gibbs sampling sample lag
+ public int numSamples = 1; // 1 + number of samples already averaged into theta/phi (incremented by updateParams)
+ public int liter = 0; // the iteration at which the model was saved
+ public int twords = 20; // print out top words per each topic
+
+ // Estimated/Inferenced parameters
+ public double[][] theta = null; // theta: document - topic distributions, size M x K
+ public double[][] phi = null; // phi: topic-word distributions, size K x V
+
+ // Temp variables while sampling
+ public TIntArrayList[] z = null; // topic assignments for words, size M x doc.size()
+ protected int[][] nw = null; // nw[i][j]: number of instances of word/term i assigned to topic j, size V x K
+ protected int[][] nd = null; // nd[i][j]: number of words in document i assigned to topic j, size M x K
+ protected int[] nwsum = null; // nwsum[j]: total number of words assigned to topic j, size K
+ protected int[] ndsum = null; // ndsum[i]: total number of words in document i, size M
+
+ protected ArrayList<TIntObjectHashMap<int[]>> nw_inf = null; // nw_inf.get(m).get(i)[j]: number of instances of word/term i assigned to topic j in doc m; only words occurring in doc m have an entry
+ protected int[][] nwsum_inf = null; // nwsum_inf[m][j]: total number of words assigned to topic j in doc m, size M x K
+
+ // temp variables for sampling
+ protected double[] p = null;
+
+ //---------------------------------------------------------------
+ // Constructors
+ //---------------------------------------------------------------
+
+ /** Builds a model from the options alone, i.e. without a previously trained model. */
+ public Model(LDACmdOption option) throws FileNotFoundException, IOException {
+     this(option, null);
+ }
+
+ /**
+  * Builds a model from the command line options and reads its dataset.
+  *
+  * @param option parsed command line options
+  * @param trnModel trained model to infer against, or null; when given, its
+  *        dictionary and topic count are used, and its alpha/beta unless overridden
+  * @throws IOException if the data file cannot be read
+  */
+ public Model(LDACmdOption option, Model trnModel) throws FileNotFoundException, IOException {
+     modelName = option.modelName;
+     K = option.K;
+     alpha = (option.alpha < 0.0) ? 50.0 / K : option.alpha;
+     if (option.beta >= 0) {
+         beta = option.beta;
+     }
+     niters = option.niters;
+     nburnin = option.nburnin;
+     samplingLag = option.samplingLag;
+     dfile = option.dfile;
+     unlabeled = option.unlabeled;
+     twords = option.twords;
+
+     // An empty -dir (the option's default) used to produce File.separator + dfile,
+     // i.e. a path in the filesystem root; resolve against the working directory instead.
+     dir = option.dir;
+     if (dir.length() == 0) {
+         dir = ".";
+     } else if (dir.endsWith(File.separator)) {
+         dir = dir.substring(0, dir.length() - 1);
+     }
+
+     data = new LDADataset();
+     if (trnModel != null) {
+         data.setDictionary(trnModel.data.localDict);
+         K = trnModel.K;
+         // hyperparameters come from the trained model unless given on the command line
+         if (option.alpha < 0.0) alpha = trnModel.alpha;
+         if (option.beta < 0.0) beta = trnModel.beta;
+     }
+
+     data.readDataSet(dir + File.separator + dfile, unlabeled);
+ }
+
+ //---------------------------------------------------------------
+ // Init Methods
+ //---------------------------------------------------------------
+
+ /**
+  * Prepares the model for sampling.
+  *
+  * @param random true to size the model from the dataset and draw random topic
+  *        assignments; false to load a saved model and reuse the assignments in z
+  * @return false if the saved model could not be loaded, true otherwise
+  */
+ public boolean init(boolean random) {
+     if (!random) {
+         if (!loadModel()) {
+             System.out.println("Fail to load word-topic assignment file of the model!");
+             return false;
+         }
+
+         // debug output
+         System.out.println("Model loaded:");
+         System.out.println("\talpha:" + alpha);
+         System.out.println("\tbeta:" + beta);
+         System.out.println("\tK:" + K);
+         System.out.println("\tM:" + M);
+         System.out.println("\tV:" + V);
+     } else {
+         M = data.M;
+         V = data.V;
+         z = new TIntArrayList[M];
+     }
+
+     p = new double[K];
+     initSS();
+
+     // rebuild the count tables from the (new or loaded) topic assignments
+     for (int m = 0; m < data.M; m++) {
+         Document doc = data.docs.get(m);
+         if (random) {
+             z[m] = new TIntArrayList();
+         }
+
+         for (int n = 0; n < doc.length; n++) {
+             int w = doc.words[n];
+             int topic;
+             // random init a topic or load existing topic from z[m]
+             if (random) {
+                 topic = (int)Math.floor(Math.random() * K);
+                 z[m].add(topic);
+             } else {
+                 topic = z[m].get(n);
+             }
+
+             nw[w][topic]++; // number of instances of word assigned to topic j
+             nd[m][topic]++; // number of words in document i assigned to topic j
+             nwsum[topic]++; // total number of words assigned to topic j
+         }
+         ndsum[m] = doc.length; // total number of words in document i
+     }
+
+     theta = new double[M][K];
+     phi = new double[K][V];
+
+     return true;
+ }
+
+ /**
+  * Builds the per-document counts used when inferring each document
+  * separately ({@code nw_inf} and {@code nwsum_inf}) from the current
+  * topic assignments in {@code z}.
+  *
+  * @return always true
+  */
+ public boolean initInf() {
+     // Java zero-fills new int arrays, so the tables need no explicit clearing.
+     // The element types below are the ones the rest of the class relies on.
+     nwsum_inf = new int[M][K];
+     nw_inf = new ArrayList<TIntObjectHashMap<int[]>>();
+
+     for (int m = 0; m < data.M; m++) {
+         TIntObjectHashMap<int[]> docCounts = new TIntObjectHashMap<int[]>();
+         nw_inf.add(m, docCounts);
+
+         int N = data.docs.get(m).length;
+         for (int n = 0; n < N; n++) {
+             int w = data.docs.get(m).words[n];
+             int topic = z[m].get(n);
+
+             // word seen for the first time in this document: start its topic counts
+             int[] wordCounts = docCounts.get(w);
+             if (wordCounts == null) {
+                 wordCounts = new int[K];
+                 docCounts.put(w, wordCounts);
+             }
+
+             wordCounts[topic]++; // number of instances of word assigned to topic j in doc m
+             nwsum_inf[m][topic]++; // total number of words assigned to topic j in doc m
+         }
+     }
+
+     return true;
+ }
+
+ /**
+  * Allocates the sufficient-statistics tables and sets every entry to zero:
+  * nw (V x K), nd (M x K), nwsum (K) and ndsum (M).
+  */
+ protected void initSS() {
+     nw = new int[V][K];
+     for (int[] topicCountsOfWord : nw) {
+         for (int k = 0; k < K; k++) {
+             topicCountsOfWord[k] = 0;
+         }
+     }
+
+     nd = new int[M][K];
+     for (int[] topicCountsOfDoc : nd) {
+         for (int k = 0; k < K; k++) {
+             topicCountsOfDoc[k] = 0;
+         }
+     }
+
+     nwsum = new int[K];
+     for (int k = 0; k < nwsum.length; k++) {
+         nwsum[k] = 0;
+     }
+
+     ndsum = new int[M];
+     for (int m = 0; m < ndsum.length; m++) {
+         ndsum[m] = 0;
+     }
+ }
+
+ //---------------------------------------------------------------
+ // Update Methods
+ //---------------------------------------------------------------
+
+ /** Folds the current counts into the running means theta and phi, then counts the sample. */
+ public void updateParams() {
+     updateTheta();
+     updatePhi();
+     numSamples += 1;
+ }
+ /** Inference variant: phi is updated with the counts of {@code trnModel} added in. */
+ public void updateParams(Model trnModel) {
+     updateTheta();
+     updatePhi(trnModel);
+     numSamples += 1;
+ }
+
+ /** Updates theta as a running mean: (old mean * (numSamples - 1) + new estimate) / numSamples. */
+ public void updateTheta() {
+     double Kalpha = K * alpha;
+     for (int m = 0; m < M; m++) {
+         for (int k = 0; k < K; k++) {
+             double sample = (nd[m][k] + alpha) / (ndsum[m] + Kalpha);
+             theta[m][k] = (numSamples > 1) ? (theta[m][k] * (numSamples - 1) + sample) / numSamples
+                                            : theta[m][k] + sample;
+         }
+     }
+ }
+
+ /** Updates phi as a running mean: (old mean * (numSamples - 1) + new estimate) / numSamples. */
+ public void updatePhi() {
+     double Vbeta = V * beta;
+     for (int k = 0; k < K; k++) {
+         for (int w = 0; w < V; w++) {
+             double sample = (nw[w][k] + beta) / (nwsum[k] + Vbeta);
+             phi[k][w] = (numSamples > 1) ? (phi[k][w] * (numSamples - 1) + sample) / numSamples
+                                          : phi[k][w] + sample;
+         }
+     }
+ }
+
+ /** Inference variant: each estimate adds the counts of {@code trnModel} to this model's own counts. */
+ public void updatePhi(Model trnModel) {
+     double Vbeta = trnModel.V * beta;
+     for (int k = 0; k < K; k++) {
+         for (int _w = 0; _w < V; _w++) {
+             if (!data.lid2gid.containsKey(_w)) {
+                 continue; // ignore words that don't appear in training
+             }
+             int id = data.lid2gid.get(_w);
+             // running mean: (old mean * (numSamples - 1) + new estimate) / numSamples
+             double sample = (trnModel.nw[id][k] + nw[_w][k] + beta) / (trnModel.nwsum[k] + nwsum[k] + Vbeta);
+             phi[k][_w] = (numSamples > 1) ? (phi[k][_w] * (numSamples - 1) + sample) / numSamples
+                                           : phi[k][_w] + sample;
+         }
+     }
+ }
+
+ //---------------------------------------------------------------
+ // I/O Methods
+ //---------------------------------------------------------------
+
+ /**
+  * Saves the model files into {@code dir} under the plain model name.
+  *
+  * @return true if every file was written
+  */
+ public boolean saveModel() {
+     return saveModel("");
+ }
+
+ /**
+  * Saves the topic assignments, the "others" parameters, theta, the top words
+  * per topic (only when {@code twords > 0}) and the word map, in that order,
+  * stopping at the first failure.
+  *
+  * @param modelPrefix text placed in front of the model name in every file name
+  * @return true if every file was written
+  */
+ public boolean saveModel(String modelPrefix) {
+     String base = dir + File.separator + modelPrefix + modelName;
+
+     if (!saveModelTAssign(base + tassignSuffix)) {
+         return false;
+     }
+     if (!saveModelOthers(base + othersSuffix)) {
+         return false;
+     }
+     if (!saveModelTheta(base + thetaSuffix)) {
+         return false;
+     }
+
+     // phi is not written here: saveModelPhi exists but is not called
+     if (twords > 0 && !saveModelTwords(base + twordsSuffix)) {
+         return false;
+     }
+
+     return data.localDict.writeWordMap(base + wordMapSuffix);
+ }
+
+ /**
+ * Save word-topic assignments for this model
+ */
+ public boolean saveModelTAssign(String filename) {
+ int i, j;
+
+ try{
+ BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
+ new GZIPOutputStream(
+ new FileOutputStream(filename)), "UTF-8"));
+
+ //write docs with topic assignments for words
+ for (i = 0; i < data.M; i++) {
+ for (j = 0; j < data.docs.get(i).length; ++j) {
+ writer.write(data.docs.get(i).words[j] + ":" + z[i].get(j) + " ");
+ }
+ writer.write("\n");
+ }
+
+ writer.close();
+ }
+ catch (Exception e) {
+ System.out.println("Error while saving model tassign: " + e.getMessage());
+ e.printStackTrace();
+ return false;
+ }
+ return true;
+ }
+
+ /**
+ * Save theta (topic distribution) for this model
+ */
+ public boolean saveModelTheta(String filename) {
+ try{
+ BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
+ new GZIPOutputStream(
+ new FileOutputStream(filename)), "UTF-8"));
+
+ for (int i = 0; i < M; i++) {
+ for (int j = 0; j < K; j++) {
+ if (theta[i][j] > 0) {
+ writer.write(j + ":" + theta[i][j] + " ");
+ }
+ }
+ writer.write("\n");
+ }
+ writer.close();
+ }
+ catch (Exception e){
+ System.out.println("Error while saving topic distribution file for this model: " + e.getMessage());
+ e.printStackTrace();
+ return false;
+ }
+ return true;
+ }
+
+ /**
+ * Save word-topic distribution
+ */
+ public boolean saveModelPhi(String filename)
+ {
+ try {
+ BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
+ new GZIPOutputStream(
+ new FileOutputStream(filename)), "UTF-8"));
+
+ for (int i = 0; i < K; i++) {
+ for (int j = 0; j < V; j++) {
+ if (phi[i][j] > 0) {
+ writer.write(j + ":" + phi[i][j] + " ");
+ }
+ }
+ writer.write("\n");
+ }
+ writer.close();
+ }
+ catch (Exception e) {
+ System.out.println("Error while saving word-topic distribution:" + e.getMessage());
+ e.printStackTrace();
+ return false;
+ }
+ return true;
+ }
+
+    /**
+     * Save the scalar parameters of this model as gzip-compressed UTF-8 text,
+     * one "key=value" pair per line.
+     *
+     * Keys written, in order: alpha, beta, ntopics (K), ndocs (M), nwords (V)
+     * and liters (the liter field). Note that the last key is spelled "liters".
+     *
+     * @param filename path of the file to write
+     * @return true on success; false if any step failed (the message is printed
+     *         to stdout and the stack trace is dumped)
+     */
+    public boolean saveModelOthers(String filename){
+        BufferedWriter writer = null;
+        try{
+            writer = new BufferedWriter(new OutputStreamWriter(
+                    new GZIPOutputStream(
+                            new FileOutputStream(filename)), "UTF-8"));
+
+            writer.write("alpha=" + alpha + "\n");
+            writer.write("beta=" + beta + "\n");
+            writer.write("ntopics=" + K + "\n");
+            writer.write("ndocs=" + M + "\n");
+            writer.write("nwords=" + V + "\n");
+            writer.write("liters=" + liter + "\n");
+
+            // Close inside the try so a failed flush is still reported as an error.
+            writer.close();
+            writer = null;
+        }
+        catch(Exception e){
+            System.out.println("Error while saving model others:" + e.getMessage());
+            e.printStackTrace();
+            return false;
+        }
+        finally {
+            // Only reached with a live writer when something above threw:
+            // release the file handle without masking the primary error.
+            if (writer != null) {
+                try {
+                    writer.close();
+                }
+                catch (Exception ignored) {
+                    // the original failure has already been reported
+                }
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Save the most likely words of every topic as gzip-compressed UTF-8 text.
+     *
+     * For each topic k a header line "Topic k:" is written, followed by up to
+     * twords lines of the form "\tword\tprobability", ordered by Pair's
+     * comparison (the Pairs are created with naturalOrder == false). Word ids
+     * that data.localDict does not contain are skipped.
+     *
+     * Side effect: twords is clamped to V when it exceeds the vocabulary size.
+     *
+     * @param filename path of the file to write
+     * @return true on success; false if any step failed (the message is printed
+     *         to stdout and the stack trace is dumped)
+     */
+    public boolean saveModelTwords(String filename){
+        BufferedWriter writer = null;
+        try{
+            writer = new BufferedWriter(new OutputStreamWriter(
+                    new GZIPOutputStream(
+                            new FileOutputStream(filename)), "UTF-8"));
+
+            if (twords > V){
+                twords = V;
+            }
+
+            for (int k = 0; k < K; k++){
+                // Typed list: with a raw ArrayList, get(i) is an Object and the
+                // .first / .second accesses below would not compile.
+                ArrayList<Pair> wordsProbsList = new ArrayList<Pair>();
+                for (int w = 0; w < V; w++){
+                    wordsProbsList.add(new Pair(w, phi[k][w], false));
+                }//end foreach word
+
+                //print topic
+                writer.write("Topic " + k + ":\n");
+                Collections.sort(wordsProbsList);
+
+                for (int i = 0; i < twords; i++){
+                    Pair entry = wordsProbsList.get(i);
+                    Integer wordId = (Integer) entry.first;
+                    if (data.localDict.contains(wordId)){
+                        String word = data.localDict.getWord(wordId);
+
+                        writer.write("\t" + word + "\t" + entry.second + "\n");
+                    }
+                }
+            } //end foreach topic
+
+            // Close inside the try so a failed flush is still reported as an error.
+            writer.close();
+            writer = null;
+        }
+        catch(Exception e){
+            System.out.println("Error while saving model twords: " + e.getMessage());
+            e.printStackTrace();
+            return false;
+        }
+        finally {
+            // Only reached with a live writer when something above threw:
+            // release the file handle without masking the primary error.
+            if (writer != null) {
+                try {
+                    writer.close();
+                }
+                catch (Exception ignored) {
+                    // the original failure has already been reported
+                }
+            }
+        }
+        return true;
+    }
+
+ /**
+ * Load saved model
+ */
+ public boolean loadModel(){
+ if (!readOthersFile(dir + File.separator + modelName + othersSuffix))
+ return false;
+
+ if (!readTAssignFile(dir + File.separator + modelName + tassignSuffix))
+ return false;
+
+ // read dictionary
+ Dictionary dict = new Dictionary();
+ if (!dict.readWordMap(dir + File.separator + modelName + wordMapSuffix))
+ return false;
+
+ data.localDict = dict;
+
+ return true;
+ }
+
+    /**
+     * Load the "others" file (gzip-compressed UTF-8 "key=value" lines) and copy
+     * the recognised parameters into this model.
+     *
+     * Recognised keys (case-insensitive): alpha, beta, ntopics (K), nwords (V),
+     * ndocs (M) and the iteration counter, accepted as either "liter" or
+     * "liters". Lines that do not split into exactly two tokens and unknown
+     * keys are ignored.
+     *
+     * @param otherFile path of the file to read
+     * @return true on success; false on any I/O or number-format error (the
+     *         message is printed to stdout and the stack trace is dumped)
+     */
+    protected boolean readOthersFile(String otherFile){
+        BufferedReader reader = null;
+        try {
+            reader = new BufferedReader(new InputStreamReader(
+                    new GZIPInputStream(
+                            new FileInputStream(otherFile)), "UTF-8"));
+            String line;
+            while((line = reader.readLine()) != null){
+                StringTokenizer tknr = new StringTokenizer(line,"= \t\r\n");
+
+                if (tknr.countTokens() != 2)
+                    continue;
+
+                String optstr = tknr.nextToken();
+                String optval = tknr.nextToken();
+
+                if (optstr.equalsIgnoreCase("alpha")){
+                    alpha = Double.parseDouble(optval);
+                }
+                else if (optstr.equalsIgnoreCase("beta")){
+                    beta = Double.parseDouble(optval);
+                }
+                else if (optstr.equalsIgnoreCase("ntopics")){
+                    K = Integer.parseInt(optval);
+                }
+                else if (optstr.equalsIgnoreCase("liter")
+                        || optstr.equalsIgnoreCase("liters")){
+                    // saveModelOthers writes the key as "liters"; matching only
+                    // "liter" meant the saved counter was silently dropped.
+                    liter = Integer.parseInt(optval);
+                }
+                else if (optstr.equalsIgnoreCase("nwords")){
+                    V = Integer.parseInt(optval);
+                }
+                else if (optstr.equalsIgnoreCase("ndocs")){
+                    M = Integer.parseInt(optval);
+                }
+                else {
+                    // any more?
+                }
+            }
+
+            reader.close();
+            reader = null;
+        }
+        catch (Exception e){
+            System.out.println("Error while reading other file:" + e.getMessage());
+            e.printStackTrace();
+            return false;
+        }
+        finally {
+            // Only reached with a live reader when something above threw:
+            // release the file handle without masking the primary error.
+            if (reader != null) {
+                try {
+                    reader.close();
+                }
+                catch (Exception ignored) {
+                    // the original failure has already been reported
+                }
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Load the word-topic assignments of this model from a gzip-compressed
+     * UTF-8 file.
+     *
+     * Exactly M lines are read, one per document; each line holds
+     * whitespace-separated "wordId:topic" tokens. The method replaces z and
+     * data: data becomes a new LDADataset with M documents and vocabulary size
+     * V, and z[i] holds the topic of every word of document i. M and V must
+     * already be set when this method is called.
+     *
+     * @param tassignFile path of the file to read
+     * @return true on success; false if the file is too short, a token is
+     *         malformed, or any I/O or number-format error occurs
+     */
+    protected boolean readTAssignFile(String tassignFile)
+    {
+        BufferedReader reader = null;
+        try {
+            int i,j;
+            reader = new BufferedReader(new InputStreamReader(
+                    new GZIPInputStream(
+                            new FileInputStream(tassignFile)), "UTF-8"));
+
+            String line;
+            z = new TIntArrayList[M];
+            data = new LDADataset();
+            data.setM(M);
+            data.V = V;
+            for (i = 0; i < M; i++){
+                line = reader.readLine();
+                if (line == null){
+                    // Fewer lines than documents: report it explicitly instead of
+                    // failing with a NullPointerException in the tokenizer.
+                    System.out.println("Invalid word-topic assignment file: expected "
+                            + M + " documents but found only " + i + "\n");
+                    return false;
+                }
+                StringTokenizer tknr = new StringTokenizer(line, " \t\r\n");
+
+                int length = tknr.countTokens();
+
+                TIntArrayList words = new TIntArrayList();
+                TIntArrayList topics = new TIntArrayList();
+                for (j = 0; j < length; j++){
+                    String token = tknr.nextToken();
+
+                    StringTokenizer tknr2 = new StringTokenizer(token, ":");
+                    if (tknr2.countTokens() != 2){
+                        System.out.println("Invalid word-topic assignment line\n");
+                        return false;
+                    }
+
+                    words.add(Integer.parseInt(tknr2.nextToken()));
+                    topics.add(Integer.parseInt(tknr2.nextToken()));
+                }//end for each topic assignment
+
+                //allocate and add new document to the corpus
+                Document doc = new Document(words);
+                data.setDoc(doc, i);
+
+                //assign values for z; topics is a fresh list owned by this
+                //iteration, so it can be stored directly instead of copied
+                z[i] = topics;
+
+            }//end for each doc
+
+            reader.close();
+            reader = null;
+        }
+        catch (Exception e){
+            System.out.println("Error while loading model: " + e.getMessage());
+            e.printStackTrace();
+            return false;
+        }
+        finally {
+            // Reached with a live reader on an exception or on an early
+            // "return false": release the file handle in both cases.
+            if (reader != null) {
+                try {
+                    reader.close();
+                }
+                catch (Exception ignored) {
+                    // nothing useful can be done about a failed close here
+                }
+            }
+        }
+        return true;
+    }
+}
diff --git a/java_LabledLDA/src/jgibblda/Pair.java b/java_LabledLDA/src/jgibblda/Pair.java
new file mode 100644
index 0000000..0d4d4cb
--- /dev/null
+++ b/java_LabledLDA/src/jgibblda/Pair.java
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2007 by
+ *
+ * Xuan-Hieu Phan
+ * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
+ * Graduate School of Information Sciences
+ * Tohoku University
+ *
+ * Cam-Tu Nguyen
+ * ncamtu@gmail.com
+ * College of Technology
+ * Vietnam National University, Hanoi
+ *
+ * JGibbsLDA is a free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * JGibbsLDA is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with JGibbsLDA; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+package jgibblda;
+
+import java.util.Comparator;
+
+/**
+ * A (key, value) pair that is ordered by its value only.
+ *
+ * The sort direction is controlled by the static flag naturalOrder: when it
+ * is true, pairs compare by the natural order of second; when it is false
+ * (the default), the order is reversed.
+ *
+ * Caveat: the flag is static, and the three-argument constructor overwrites
+ * it, so constructing one Pair changes the ordering of every Pair.
+ */
+@SuppressWarnings({"rawtypes", "unchecked"})
+public class Pair implements Comparable<Pair> {
+    // key; not used for ordering
+    public Object first;
+    // value that defines the ordering
+    public Comparable second;
+    // shared sort direction: true = natural order of second, false = reversed
+    public static boolean naturalOrder = false;
+
+    /**
+     * Create a pair without touching the shared sort direction.
+     *
+     * @param k key
+     * @param v value used for ordering
+     */
+    public Pair(Object k, Comparable v){
+        first = k;
+        second = v;
+    }
+
+    /**
+     * Create a pair and set the shared sort direction for ALL pairs.
+     *
+     * @param k key
+     * @param v value used for ordering
+     * @param naturalOrder new value of the static naturalOrder flag
+     */
+    public Pair(Object k, Comparable v, boolean naturalOrder){
+        first = k;
+        second = v;
+        Pair.naturalOrder = naturalOrder;
+    }
+
+    /**
+     * Compare by second, in natural or reversed order depending on the
+     * static naturalOrder flag.
+     *
+     * @param p the pair to compare with
+     * @return negative, zero or positive per the Comparable contract
+     */
+    @Override
+    public int compareTo(Pair p){
+        if (naturalOrder)
+            return this.second.compareTo(p.second);
+        // Reverse by swapping the operands rather than negating the result:
+        // negating Integer.MIN_VALUE overflows and keeps the wrong sign.
+        return p.second.compareTo(this.second);
+    }
+}
diff --git a/old/testo.py b/old/testo.py
new file mode 100644
index 0000000..5a990f6
--- /dev/null
+++ b/old/testo.py
@@ -0,0 +1,199 @@
+# -*- coding: utf-8 -*-
+import functools
+import re
+
+import spacy
+import textacy
+from spacy.tokens import Doc
+from spacy.tagger import Tagger
+
+import xml.etree.ElementTree as ET
+
+# spaCy pipeline for the 'de' model, loaded once at import time and used as the
+# default parser by the helpers below.
+PARSER = spacy.load('de')
+# Import the language module named after the loaded pipeline ("spacy.<lang>")
+# and copy its STOP_WORDS into a list.
+stop_words = list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)
+
+def compose(*functions):
+ def compose2(f, g):
+ return lambda x: f(g(x))
+ return functools.reduce(compose2, functions, lambda x: x)
+
+
+def cleanTexts(textstream, parser, attr):
+
+ #input str-stream output str-stream
+ pipe = parser.pipe(textstream)
+
+ for doc in pipe:
+
+ tokens = [tok.text for tok in doc
+ if tok.pos_ not in attr
+ and tok.tag_ not in attr
+ and tok.ent_ not in attr
+ and tok.text not in attr
+ and tok.lower_ not in attr]
+
+
+ yield " ".join(tokens)
+
+
+ """
+def cleanDoc_lemmatize(doc,parser=PARSER):
+ return parser(" ".join([tok.lemma_ for tok in doc ]))
+
+
+def cleanDoc_STOPS(doc,parser=PARSER, stop_words=None, keep=None):
+ if stop_words is None:
+ stop_words = list(__import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS)
+
+ if hasattr(keep, '__iter__'):
+ for k in keep:
+ try:
+ stop_words.remove(k)
+ except ValueError:
+ pass
+
+ return parser(" ".join([tok.text for tok in doc if tok.text not in stop_words]))
+
+
+
+def cleanDoc_ENT(doc,parser=PARSER, keeponly=False, attr=["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]):
+ if keeponly:
+ return parser(" ".join([tok.text for tok in doc if tok.ent_ in attr]))
+ else:
+ return parser(" ".join([tok.text for tok in doc if tok.ent_ not in attr]))
+
+
+
+def cleanDoc_POS(doc,parser=PARSER, keeponly=False, attr=["SPACE", "PUNCT"]):
+ if keeponly:
+ return parser(" ".join([tok.text for tok in doc if tok.pos_ in attr]))
+ else:
+ return parser(" ".join([tok.text for tok in doc if tok.pos_ not in attr]))
+"""
+
+
+def cleanTexts_POS(spacypipe, keeponly=False, attr=["SPACE", "PUNCT"]):
+ """
+ :param spacypipe: spacypipe
+ :param keeponly: bool . If True, only attr will be kept. If false, all attr will be deleted
+ :param attr: [str] pos_ or ent_type_
+ :yields: stream of strings: full-length cleaned text
+ """
+ if keeponly:
+ for doc in spacypipe:
+ yield " ".join([tok.text for tok in doc if tok.pos_ in attr])
+
+ else:
+ for doc in spacypipe:
+ yield " ".join([tok.text for tok in doc if tok.pos_ not in attr])
+
+def cleanText_POS(text,parser=PARSER, keeponly=False, attr=["SPACE", "PUNCT"]):
+ """
+ :param txt: str
+ :param keeponly: bool . If True, only attr will be kept. If false, all attr will be deleted
+ :param attr: [str] pos_ or ent_type_
+ :return: str
+ """
+ doc = parser(text)
+
+ if keeponly:
+ return " ".join([tok.text for tok in doc if tok.pos_ in attr])
+ else:
+ return " ".join([tok.text for tok in doc if tok.pos_ not in attr])
+
+
+def removeWhitespace(string):
+ return re.sub(r'(\r\n|\r|\n|(\s)+)', ' ', string)
+
+def removeWords(string, words):
+ big_regex = re.compile('|'.join(map(re.escape, words)))
+ return big_regex.sub("", string)
+
+
+
+
+
+
+def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung', cleaning_function=None):
+ """
+ generates strings from XML
+ :param path2xml:
+ :param main_textfield:
+ :param cleaning_function:
+ :yields strings
+ """
+ import xml.etree.ElementTree as ET
+
+ tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
+ root = tree.getroot()
+
+
+ for ticket in root:
+ text = "ERROR"
+ for field in ticket:
+ if field.tag == main_textfield:
+ if cleaning_function:
+ text = cleaning_function(field.text)
+ else:
+ text = field.text
+ yield text
+
+def generateMetadatafromTicketXML(path2xml, key_function_pairs_to_clean, leave_out=['Beschreibung']):
+ import xml.etree.ElementTree as ET
+
+ tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
+
+ root = tree.getroot()
+
+ for ticket in root:
+ metadata = {}
+ for field in ticket:
+ if field.tag not in leave_out:
+
+ if field.tag in key_function_pairs_to_clean:
+ metadata[field.tag] = key_function_pairs_to_clean[field.tag](field.text)
+ else:
+ metadata[field.tag] = field.text
+
+ yield metadata
+
+
+
+
+# Sample ticket text; within this file it is only referenced by the
+# commented-out removeWords experiment below.
+string = "Frau Hinrichs überdenkt die tu Situation a@bc.de und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"
+
+#print(removeWords(string,["die", "neue"]))
+
+# in:str out:str
+# compose applies its arguments right to left: e-mail addresses are replaced
+# by the token EMAIL first, then cleanText_POS filters the result.
+cleanString = compose(
+    cleanText_POS,
+    functools.partial(textacy.preprocess.replace_emails, replace_with=u'EMAIL')
+)
+
+# Per-field cleaning functions keyed by XML tag; only referenced by the
+# commented-out metadata argument of corpus.add_texts below.
+key_function_pairs_to_clean = {
+    "Loesung":removeWhitespace,
+    "Zusammenfassung":cleanText_POS
+}
+"""
+# in:str-gen out:str-gen
+cleanStream = compose(
+ removeSTOP,
+ lemmatize,
+ cleanEnt
+)
+"""
+# Intended data flow:
+# content: xml -> stringCleaning -> pipe -> docCleaning -> corpus
+# metadata:xml -> -> stringCleaning -> corpus
+
+# Corpus built on the module-level parser.
+corpus = textacy.Corpus(PARSER)
+
+
+
+
+# Stream the main text of every ticket in ticketSamples.xml through cleanTexts
+# (filtering on "PUNCT", "SPACE" and "PERSON") into the corpus. The metadata
+# stream is currently disabled.
+corpus.add_texts(
+    cleanTexts(generateMainTextfromTicketXML("ticketSamples.xml"),PARSER,["PUNCT","SPACE","PERSON"])#,
+    #generateMetadatafromTicketXML("ticketSamples.xml",key_function_pairs_to_clean=key_function_pairs_to_clean)
+)
+
+# Print the text of the first document as a quick check.
+print(corpus[0].text)
+