// Java Code

import java.io.*;
import java.math.*;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import edu.upc.Jfreeling.*;
import static java.nio.file.FileVisitResult.CONTINUE;
import static java.nio.file.FileVisitResult.SKIP_SUBTREE;
import java.util.ArrayList;
import java.util.Collections;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.sf.extjwnl.*;
import net.sf.extjwnl.data.*;
import net.sf.extjwnl.data.list.*;
import net.sf.extjwnl.dictionary.*;
import net.sf.extjwnl.util.*;
import org.slf4j.LoggerFactory;

/**
 * Builds sparse feature lines from a directory tree of text files.
 *
 * <p>Every file below the training root is tokenized, split, morphologically
 * analysed, tagged and sense-annotated with FreeLing. The numeric identifiers
 * extracted from each word's sense string, plus the identifiers scraped from
 * the WordNet meronym tree of the word's first sense, are counted per file and
 * appended to {@link #SB} as {@code "<label> <id>:<count> ..."}. The buffer is
 * written to {@link #BW} once the whole tree has been visited.
 *
 * <p>All state is kept in public static fields. They are retained (including
 * the purely scratch ones) so that existing code referring to them keeps
 * compiling. The class is not thread-safe.
 */
public class Sample extends SimpleFileVisitor<Path> {

    private static final String OS = System.getProperty("os.name").toLowerCase();

    /** Identifiers seen in the file currently being processed (with repetitions). */
    public static final ArrayList<Long> MatriceFile = new ArrayList<Long>();

    /** Every identifier seen so far; kept sorted ascending and duplicate-free by {@link #insert(long)}. */
    public static final ArrayList<Long> MatriceGlobale = new ArrayList<Long>();

    /** Label prefix (number followed by a space) of the file currently being processed. */
    public static String classename = null;

    /** Scratch: occurrence count most recently computed while building a feature line. */
    public static int Occurences = 0;

    /** Number of files for which a feature line has been produced. */
    public static int Filecount = 0;

    /** Accumulates all feature lines until they are written at the end of {@link #main}. */
    public static StringBuffer SB = new StringBuffer();

    /** Output writer opened in append mode at class load; {@code null} if opening failed. */
    public static BufferedWriter BW;

    static {
        try {
            BW = new BufferedWriter(new FileWriter("/home/ayto/Desktop/HyperTrainNOVA", true));
        } catch (IOException e) {
            // Kept non-fatal so the class still loads; main() refuses to run with a null writer.
            e.printStackTrace();
        }
    }

    /** Scratch: last identifier text matched in a FreeLing sense string. */
    public static String TestWord = "";

    /** Scratch: string form of the last relation tree node list that was scanned. */
    public static String TestLexic = "";

    /** Scratch: last identifier parsed from a FreeLing sense string. */
    public static long N = 0;

    /** Scratch: last identifier parsed from a relation tree. */
    public static long N2 = 0;

    /**
     * Captures a run of digits that is followed by a character other than a
     * digit, '.', '+' or '/'.
     */
    public static Pattern pattern = Pattern.compile("(\\d+)[^\\d.\\d+/]");

    /** Scratch: matcher used on FreeLing sense strings. */
    public static Matcher match;

    public static PointerTargetTree hypernyms;
    public static PointerTargetTree hyponyms;
    public static PointerTargetTree holonyms;
    public static PointerTargetTree meronyms;

    /** Scratch: last identifier text matched in a relation tree. */
    public static String offsets;

    /** Result of the most recent dictionary lookup done by {@link #Holonym} or {@link #Meronym}. */
    public static IndexWord mot = null;

    /** Scratch: matcher used on relation trees. */
    public static Matcher match2;

    /** WordNet dictionary, initialised in {@link #main}. */
    public static net.sf.extjwnl.dictionary.Dictionary dico;

    /**
     * Runs the extraction over the hard-coded training directory and writes the
     * collected feature lines to the output file.
     *
     * @param argv ignored
     * @throws IOException   if the output file could not be opened, or on any
     *                       I/O failure while reading the corpus or writing the result
     * @throws JWNLException if the WordNet dictionary cannot be initialised
     */
    public static void main(String argv[]) throws IOException, JWNLException {
        // Fail before the (long) corpus run rather than with an NPE at the very end.
        if (BW == null) {
            throw new IOException("Output file could not be opened (see the stack trace printed at class load)");
        }

        // Initialize extJWNL from an explicit properties file.
        String propsFile = "/home/ayto/Desktop/extjwnl-2.0.3/src/extjwnl/src/main/resources/net/sf/extjwnl/dictionary/file_properties.xml";
        try (InputStream props = new FileInputStream(propsFile)) {
            dico = net.sf.extjwnl.dictionary.Dictionary.getInstance(props);
        }

        // Connect to the FreeLing native library.
        System.load("/usr/local/share/freeling/APIs/java/libJfreeling.so");

        // Check whether we know where to find FreeLing data files.
        String freelingDir = System.getenv("FREELINGDIR");
        if (freelingDir == null) {
            freelingDir = OS.indexOf("win") >= 0 ? "C:\\Program Files" : "/usr/local";
            System.err.println("FREELINGDIR environment variable not defined, trying " + freelingDir);
        }

        File shareDir = new File(freelingDir + "/share/freeling");
        if (!shareDir.exists()) {
            System.err.println("Folder " + freelingDir + "/share/freeling not found.");
            System.err.println("Please set FREELINGDIR environment variable to FreeLing installation directory");
            System.exit(1);
        }

        // Location of FreeLing configuration files.
        final String dataDir = freelingDir + "/share/freeling/";

        // Init locales.
        Util.initLocale("default");

        // Create options set for the maco analyzer.
        final String lang = "en";
        MacoOptions op = new MacoOptions(lang);
        op.setDataFiles("",
                dataDir + "common/punct.dat",
                dataDir + lang + "/dicc.src",
                dataDir + lang + "/afixos.dat",
                "",
                dataDir + lang + "/locucions.dat",
                dataDir + lang + "/np.dat",
                dataDir + lang + "/quantities.dat",
                dataDir + lang + "/probabilitats.dat");

        // Language detector: its result is only printed; processing always uses `lang`.
        final LangIdent lgid = new LangIdent(dataDir + "/common/lang_ident/ident-few.dat");

        final Tokenizer tk = new Tokenizer(dataDir + lang + "/tokenizer.dat");
        final Splitter sp = new Splitter(dataDir + lang + "/splitter.dat");
        final SWIGTYPE_p_splitter_status sid = sp.openSession();

        final Maco mf = new Maco(op);
        mf.setActiveOptions(false, true, true, true,   // select which among created
                true, true, false, true,               // submodules are to be used.
                true, true, true, true);

        final HmmTagger tg = new HmmTagger(dataDir + lang + "/tagger.dat", true, 2);
        final Nec neclass = new Nec(dataDir + lang + "/nerc/nec/nec-ab-poor1.dat");
        final Senses sen = new Senses(dataDir + lang + "/senses.dat");   // sense dictionary
        final Ukb dis = new Ukb(dataDir + lang + "/ukb.dat");            // sense disambiguator
        // The chunk parser and the dependency parser are not created: no stage
        // of this pipeline consumes their output.

        Path root = Paths.get("/home/ayto/Desktop/TESTCORPUS/train");

        try {
            Files.walkFileTree(root, new SimpleFileVisitor<Path>() {

                @Override
                public FileVisitResult visitFile(Path file, BasicFileAttributes attr) throws IOException {
                    String label = labelFor(file);
                    if (label == null) {
                        // Without this check the previous file's label (or the text
                        // "null") would be written in front of this file's features.
                        System.err.println("Skipping " + file + ": parent folder is not a known category");
                        return FileVisitResult.CONTINUE;
                    }
                    classename = label;

                    // The reader uses the platform default charset, as before; make
                    // sure it matches the encoding of the input text.
                    try (BufferedReader input = new BufferedReader(
                            new InputStreamReader(new FileInputStream(file.toFile())))) {
                        String line = input.readLine();

                        // Identify the language from the first line only. This does
                        // NOT adapt the analyzers, which were all created for `lang`.
                        String lg = (line == null) ? "unknown (empty file)" : lgid.identifyLanguage(line);
                        System.out.println("CURRENT FILE: " + file.getFileName());
                        System.out.println("-------- LANG_IDENT results -----------");
                        System.out.println("Language detected (from first line in text): " + lg);

                        while (line != null) {
                            // Tokenize the line and split the tokens into sentences; the
                            // splitter may hold back an unfinished sentence.
                            ListSentence ls = sp.split(sid, tk.tokenize(line), false);
                            annotate(ls);
                            harvest(ls);
                            line = input.readLine();
                        }
                    }

                    // Flush any sentences lingering in the splitter buffer and
                    // count their senses too, so the end of the file is not lost.
                    System.out.println("-------- Flushing buffers -----------");
                    ListSentence rest = sp.split(sid, tk.tokenize(""), true);
                    annotate(rest);
                    harvest(rest);

                    appendFeatureLine();
                    MatriceFile.clear();
                    Filecount++;
                    System.out.println("DONE: " + Filecount + "/7193");
                    return FileVisitResult.CONTINUE;
                }

                /** Runs morphology, tagging, NE classification and sense annotation on {@code ls}. */
                private void annotate(ListSentence ls) {
                    mf.analyze(ls);
                    tg.analyze(ls);
                    neclass.analyze(ls);
                    sen.analyze(ls);
                    dis.analyze(ls);
                }

                /** Records the identifiers of every sense-annotated word of {@code ls}. */
                private void harvest(ListSentence ls) {
                    ListSentenceIterator sIt = new ListSentenceIterator(ls);
                    while (sIt.hasNext()) {
                        Sentence s = sIt.next();
                        ListWordIterator wIt = new ListWordIterator(s);
                        while (wIt.hasNext()) {
                            edu.upc.Jfreeling.Word w = wIt.next();
                            System.out.println(w.getForm());
                            String senses = w.getSensesString();
                            if (senses.isEmpty()) {
                                continue;
                            }
                            match = pattern.matcher(senses);
                            while (match.find()) {
                                TestWord = match.group(1);
                                N = Long.parseLong(TestWord);
                                insert(N);   // no-op when N is already known
                                MatriceFile.add(N);
                            }
                            try {
                                Meronym(w);
                            } catch (JWNLException ex) {
                                Logger.getLogger(Sample.class.getName()).log(Level.SEVERE,
                                        "WordNet lookup failed for lemma " + w.getLemma(), ex);
                            }
                        }
                        wIt.delete();
                    }
                    sIt.delete();
                }
            });

            System.out.println(Filecount);
            BW.write(SB.toString());
            BW.flush();
        } finally {
            try {
                sp.closeSession(sid);
            } finally {
                BW.close();
            }
        }
    }

    /**
     * Maps the name of the folder containing {@code file} to its label prefix.
     *
     * @return the label (a number followed by one space), or {@code null} when
     *         the folder name is not one of the ten known categories
     */
    private static String labelFor(Path file) {
        Path parent = file.getParent();
        if (parent == null || parent.getFileName() == null) {
            return null;
        }
        switch (parent.getFileName().toString()) {
            case "acq":      return "1 ";
            case "corn":     return "2 ";
            case "crude":    return "3 ";
            case "earn":     return "4 ";
            case "grain":    return "5 ";
            case "interest": return "6 ";
            case "money-fx": return "7 ";
            case "ship":     return "8 ";
            case "trade":    return "9 ";
            case "wheat":    return "10 ";
            default:         return null;
        }
    }

    /**
     * Appends one line to {@link #SB}: the current {@link #classename} followed
     * by {@code id:count} pairs in ascending id order for every identifier in
     * {@link #MatriceFile} that lies strictly between 0 and 1000000000.
     *
     * <p>Sorting a copy of the per-file list and counting runs yields the same
     * text as scanning the whole sorted vocabulary and asking for the frequency
     * of each entry, without the quadratic cost.
     */
    private static void appendFeatureLine() {
        SB.append(classename);
        ArrayList<Long> sorted = new ArrayList<Long>(MatriceFile);
        Collections.sort(sorted);
        int start = 0;
        while (start < sorted.size()) {
            long id = sorted.get(start);
            int end = start;
            while (end < sorted.size() && sorted.get(end).longValue() == id) {
                end++;
            }
            Occurences = end - start;
            if (id > 0 && id < 1000000000) {
                SB.append(id + ":" + Occurences + " ");
            }
            start = end;
        }
        SB.append(System.lineSeparator());
    }

    /** Identity operation: makes {@code getAllMatches} return every node of a tree. */
    public static PointerTargetTreeNodeList.Operation opr = new PointerTargetTreeNodeList.Operation() {
        @Override
        public PointerTargetTreeNode execute(PointerTargetTreeNode testNode) {
            return testNode;
        }
    };

    /**
     * Looks {@code lemme} up in the dictionary under the part of speech selected
     * by the first character of {@code tag} (N noun, V verb, J adjective, R adverb).
     *
     * @return the dictionary entry, or {@code null} when the tag is null or
     *         empty, starts with any other character, or the lookup finds nothing
     * @throws JWNLException if the dictionary lookup fails
     */
    private static IndexWord findingID(String tag, String lemme) throws JWNLException {
        if (tag == null || tag.isEmpty()) {
            return null;
        }
        switch (tag.charAt(0)) {
            case 'N': return dico.getIndexWord(POS.NOUN, lemme);
            case 'V': return dico.getIndexWord(POS.VERB, lemme);
            case 'J': return dico.getIndexWord(POS.ADJECTIVE, lemme);
            case 'R': return dico.getIndexWord(POS.ADVERB, lemme);
            default:  return null;
        }
    }

    /**
     * Adds {@code x} to {@link #MatriceGlobale} at its sorted position unless it
     * is already present.
     *
     * <p>Relies on the list being sorted ascending, which holds as long as it is
     * only ever modified through this method.
     *
     * @param x identifier to add
     */
    public static void insert(long x) {
        int pos = Collections.binarySearch(MatriceGlobale, x);
        if (pos < 0) {
            // binarySearch encodes the insertion point as -(point) - 1.
            MatriceGlobale.add(-pos - 1, x);
        }
    }

    /** Returns whether {@code word} is non-null and has at least one sense. */
    private static boolean hasSense(IndexWord word) throws JWNLException {
        return word != null && !word.getSenses().isEmpty();
    }

    /**
     * Scans the string form of every node of {@code tree} with {@link #pattern}
     * and records each captured number in both {@link #MatriceGlobale} and
     * {@link #MatriceFile}.
     *
     * <p>NOTE(review): this scrapes numbers out of {@code toString()} output, so
     * any digit run in that text that satisfies the pattern is recorded, not
     * only synset offsets. Confirm that is intended.
     */
    private static void recordTreeOffsets(PointerTargetTree tree) {
        TestLexic = String.valueOf(tree.getAllMatches(opr));
        match2 = pattern.matcher(TestLexic);
        while (match2.find()) {
            offsets = match2.group(1);
            N2 = Long.parseLong(offsets);
            insert(N2);   // no-op when N2 is already known
            MatriceFile.add(N2);
        }
    }

    /**
     * Records the identifiers found in the hypernym tree (depth 5) of the first
     * sense of {@code mot}. Does nothing when {@code mot} is null or has no sense.
     *
     * @throws JWNLException if the WordNet query fails
     */
    public static void Hypernym(IndexWord mot) throws JWNLException {
        if (!hasSense(mot)) {
            return;
        }
        hypernyms = PointerUtils.getHypernymTree(mot.getSenses().get(0), 5);
        recordTreeOffsets(hypernyms);
    }

    /**
     * Records the identifiers found in the hyponym tree (depth 2) of the first
     * sense of {@code mot}. Does nothing when {@code mot} is null or has no sense.
     *
     * @throws JWNLException if the WordNet query fails
     */
    public static void Hyponym(IndexWord mot) throws JWNLException {
        if (!hasSense(mot)) {
            return;
        }
        hyponyms = PointerUtils.getHyponymTree(mot.getSenses().get(0), 2);
        recordTreeOffsets(hyponyms);
    }

    /**
     * Looks up {@code w} by tag and lemma and records the identifiers found in
     * the inherited holonym tree of its first sense. Does nothing when the word
     * is not found in the dictionary.
     *
     * @throws JWNLException if the WordNet query fails
     */
    public static void Holonym(edu.upc.Jfreeling.Word w) throws JWNLException {
        mot = findingID(w.getTag(), w.getLemma());
        if (!hasSense(mot)) {
            return;
        }
        holonyms = PointerUtils.getInheritedHolonyms(mot.getSenses().get(0), 3, 1);
        recordTreeOffsets(holonyms);
    }

    /**
     * Looks up {@code w} by tag and lemma and records the identifiers found in
     * the inherited meronym tree of its first sense. Does nothing when the word
     * is not found in the dictionary.
     *
     * @throws JWNLException if the WordNet query fails
     */
    public static void Meronym(edu.upc.Jfreeling.Word w) throws JWNLException {
        mot = findingID(w.getTag(), w.getLemma());
        if (!hasSense(mot)) {
            return;
        }
        meronyms = PointerUtils.getInheritedMeronyms(mot.getSenses().get(0), 3, 1);
        recordTreeOffsets(meronyms);
    }
}