Squashed initial commit

2024-09-10 13:47:29 -04:00
commit 8ebb6ad265
6221 changed files with 2512206 additions and 0 deletions

View File: ExecMain.java

@@ -0,0 +1,102 @@
/*
* This is the main class to execute the computation and get the GOID.
*/
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.Vector;
import weka.core.Instances;
/**
*
* @author DTian
*/
public class ExecMain {
public static void main(String[] args) {
// SGD2AttrTable sgd = new SGD2AttrTable();
// Get clusters
try {
// Check the input arguments
checkParameters(args.length);
GetClusters myGetClusters = new GetClusters();
// Load the input file (ARFF or CSV)
Instances oriData = myGetClusters.input(args[0]);
// Get the output file names
String outputWholeCluster = args[0] + "-WholeTree.txt";
String outputFinalTable = args[0] + "-finalTable.csv";
String outputSummary = args[0] + "-summary.csv";
boolean fromFunction = args[4].trim().compareToIgnoreCase("true") == 0;
// Create the root cluster name
int round = 1; //for tree level
int position = 0; // for node position in same level
String rootName = (round-1)+"-"+position+"-"+0;
// System.out.println("root cluster is:" + rootName);
myGetClusters.printRootName(outputWholeCluster,rootName);
// Create vectors for the final table and the summary
Vector vecFinalTable = new Vector();
Vector vecSummary = new Vector();
//get the variable name
vecFinalTable.addElement(myGetClusters.printTableHead(oriData));
// Create the root node
TreeNode root = new TreeNode(rootName,0.0,oriData,null);
OutputStreamWriter xmlWriter = new OutputStreamWriter(
new FileOutputStream(new File("tree.xml"), true));
xmlWriter.write(" <tree>\n <declarations> \n <attributeDecl name=\"name\" type=\"String\"/>\n </declarations>\n");
xmlWriter.write(" <branch>\n <attribute name=\"name\" value=\""+"root" +"\"/>\n");
xmlWriter.flush();
// Recursively cluster the data
myGetClusters.clustering(
root,
round,
""+position,
vecFinalTable,
vecSummary,
outputWholeCluster,
xmlWriter,
args[1],
args[2],
args[3],
fromFunction
);
xmlWriter.write(" </branch>\n");
xmlWriter.write("</tree>\n");
xmlWriter.close();
// Output final result
myGetClusters.printVector(vecFinalTable,outputFinalTable);
myGetClusters.printVector(vecSummary,outputSummary);
} catch (Exception e) {
e.printStackTrace();
System.exit(1);
}
}
/**
* check the number of the arguments:
* java ExecMain arg1 arg2 ...
*
* @param length the number of arguments;
* in this program, length should be 5
*/
private static void checkParameters(int length) {
if(length != 5) {
// there are 5 parameters: 1, file for clustering; 2, lookup table file;
// 3, background file; 4, repeat count (an integer); 5, GoIDfromFunction (boolean)
System.out.println("Usage: java ExecMain clusterFileName GoMatrixFilename backGroundFilename repeatTime true|false");
System.exit(1);
}
}
}
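/*
 * Example invocation (hypothetical file names):
 *
 *   java ExecMain expression.csv goMatrix.txt background.txt 100 true
 *
 * This would produce expression.csv-WholeTree.txt, expression.csv-finalTable.csv,
 * expression.csv-summary.csv, and tree.xml (the WholeTree file and tree.xml are
 * opened in append mode).
 */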

View File: GetClusters.java

@@ -0,0 +1,532 @@
/**
* This program takes an input file (either in ARFF or CSV format) and
* outputs 3 files: one with the tree structure, one with the final table
* containing all information, and one with the summary information.
*
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Vector;
import weka.clusterers.ClusterEvaluation;
import weka.clusterers.EM;
import weka.core.Instances;
public class GetClusters {
public GetClusters() {
}
/**
* Recursively cluster the data at the given node with EM.
*
* @param root the tree node to cluster
* @param generation the depth of the node in the tree
* @param position the position of the node within its level
* @param vecFinalTable accumulates the rows of the final table
* @param vecSummary accumulates the rows of the summary
*/
public int clustering(TreeNode root, int generation,
String position, Vector vecFinalTable,
Vector vecSummary, String outputFilename, OutputStreamWriter xmlWriter,
String lookupFile, String backgroundFile, String count, boolean fromFunction) {
int result = 0;
try {
FileOutputStream stream;// provides file access
OutputStreamWriter writer;// writes to the file
stream = new FileOutputStream(new File(outputFilename), true);
writer = new OutputStreamWriter(stream);
// ***** 1 create a copy of original data *****
Instances oriData = root.getData();
Instances data = new Instances(oriData);
// ***** 2 remove attribute: orf_name(string attribute) *****
// data.deleteAttributeAt(0);
data.deleteStringAttributes();
// ***** 3 clustering *****
EM clusterer = new EM(); // new instance of clusterer
clusterer.buildClusterer(data); // build the clusterer
// evaluate cluster
ClusterEvaluation eval = new ClusterEvaluation();
eval.setClusterer(clusterer); // the cluster to evaluate
eval.evaluateClusterer(data); // data to evaluate the clusterer on
// Get the rawGoID and zScore for this cluster
String[] clusterNames = getClusterNames(oriData);
double[] goID = null;
if(fromFunction){
goID = this.getGoIDFromFunc(clusterNames, lookupFile, backgroundFile,count);
}else{
goID = this.getGoID(clusterNames, lookupFile, backgroundFile,count);
}
double logLikelihood = eval.getLogLikelihood();
writer.write("logLikelihood is: " + logLikelihood + "\n");
writer.write("GoID is: " + goID[0] + "\n");
writer.write("zScore is: " + goID[1] + "\n\n");
writer.flush();
// ***** 4 get the sub clusters *****
int numberOfSubCluster = eval.getNumClusters();
if (numberOfSubCluster > 1) {// not an end node
// create numberOfSubCluster instances array to store sub
// clusters
Instances[] subData = new Instances[numberOfSubCluster];
TreeNode[] subNode = new TreeNode[numberOfSubCluster];
for (int i = 0; i < numberOfSubCluster; i++) {
subData[i] = new Instances(oriData);
subData[i].delete();// keep only data head(attributes part)
}
// //System.out.println("\nlength is: " + data.numInstances());
// //System.out.println("number of clusters: " +
// numberOfSubCluster);
// //System.out.println(eval.clusterResultsToString());
double[] dArray = eval.getClusterAssignments();
for (int i = 0; i < dArray.length; i++) {
int clusterNumber = (int) dArray[i];
// //System.out.println("\ngene " + i + " is in cluster: "
// + clusterNumber + ",\tlog likelihood is:"
// + eval.getLogLikelihood());
// //System.out.println("***************");
// assign each gene to its corresponding cluster
for (int j = 0; j < subData.length; j++) {
if (j == clusterNumber) {
subData[j].add(oriData.instance(i));
}
}// end of inner j loop
}// end of outer i loop
// ***** 5 recursive call *****
String uniName = "";
// for (int i = 0; i <= generation; i++) {
// uniName += "0";
// }
uniName += generation + "-" + position;
generation++;
for (int i = 0; i < numberOfSubCluster; i++) {
String name = uniName + "-" + i;
//System.out.println("\n******************************");
//System.out.println("cluster name: " + name);
writer.write("\n******************************\n");
writer.write("cluster name: " + name + "\n");
writer.flush();
xmlWriter.write(" <branch>\n <attribute name=\"name\" value=\"" + name + "\"/>\n");
xmlWriter.flush();
subNode[i] = new TreeNode(name, eval.getLogLikelihood(),
subData[i], root);
result += clustering(subNode[i], generation,
position + "." + i, vecFinalTable, vecSummary, outputFilename,
xmlWriter,lookupFile,backgroundFile,count,fromFunction);
xmlWriter.write(" </branch>\n");
xmlWriter.flush();
}// end of for loop
} else { //for leaf node
//System.out.println("leaf node");
result = 1;
int temp = 1;
if (!vecSummary.isEmpty()) {
String strT = (vecSummary.lastElement().toString()).split(",")[1];
temp = Integer.parseInt(strT.trim()) + 1;
}
writer.write("leaf node\n");
writer.flush();
for (int i = 0; i < root.getData().numInstances(); i++) {
String strTemp = eval.getLogLikelihood() + "," + root.getData().instance(i) + "," + getAncestor(root, false) + "," + temp;
//System.out.println( strTemp);
writer.write(strTemp + "\n");
writer.flush();
xmlWriter.write("<leaf>\n <attribute name=\"name\" value=\"" + root.getData().instance(i).stringValue(0) + "\"/>\n</leaf>\n");
xmlWriter.flush();
vecFinalTable.addElement(strTemp);
}
vecSummary.addElement(getAncestor(root, false).toString() + "," + temp + "," + root.getData().numInstances() + "," + logLikelihood);
//System.out.println("******************************\n");
writer.write("******************************\n");
writer.flush();
generation--;
}//end of else
writer.close();
stream.close();
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return result;
}//end of method "clustering"
/**
* output the root cluster name to file
* @param fileName output file name
* @param rootName the root cluster name
*/
public void printRootName(String fileName, String rootName) {
try {
FileOutputStream stream;// provides file access
OutputStreamWriter writer;// writes to the file
stream = new FileOutputStream(new File(fileName), true);
writer = new OutputStreamWriter(stream);
writer.write("root cluster is:" + rootName + "\n");
writer.flush();
writer.close();
stream.close();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
/**
* Build the CSV header row for the final table from the attribute names.
*
* @param data the data set whose attribute names are used
*/
public String printTableHead(Instances data) {
String strResult = "likelihood";
for (int i = 0; i < data.numAttributes(); i++) {
String strTemp = "";
String[] strArr = data.attribute(i).toString().split("\\ ");
for (int j = 1; j < strArr.length - 1; j++) {
strTemp += strArr[j];
}
strResult += "," + strTemp;
}
return strResult + ",cluster origin,cluster ID";
}//end of method "printTableHead"
/**
* print the vector to a file
* @param vec the vector to print
* @param outputFilename the output file name
*/
public void printVector(Vector vec, String outputFilename) {
//System.out.println("\n***************************");
//System.out.println("*** final result ***");
//System.out.println("***************************");
try {
FileOutputStream stream;// provides file access
OutputStreamWriter writer;// writes to the file
stream = new FileOutputStream(new File(outputFilename), false);
writer = new OutputStreamWriter(stream);
for (int i = 0; i < vec.size(); i++) {
//System.out.println(vec.elementAt(i));
writer.write(vec.elementAt(i).toString() + "\n");
}
writer.close();
stream.close();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
//System.out.println("\n***************************");
//System.out.println("*** end of final result ***");
//System.out.println("***************************");
}
/**
*
* @param endNode a leaf node
* @return a string containing the names of all ancestors of the node
*/
public String getAncestor(TreeNode endNode, boolean fromLeafNode) {
String strResult = endNode.getStrName();
TreeNode tempNode = endNode;
while (tempNode.getParent() != null) {
tempNode = tempNode.getParent();
strResult += "; " + tempNode.getStrName();
}
if (fromLeafNode) {
return strResult;
} else {
String newResult = "";
String[] history = strResult.split("\\;");
for (int i = history.length; i > 0; i--) {
newResult += history[i - 1] + "; ";
}
return newResult;
}
}
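// Example (illustrative): for a leaf named "1-0-1" whose parent is the root
// "0-0-0", getAncestor(leaf, false) yields the root-first chain
// "0-0-0; 1-0-1; " (up to whitespace), while fromLeafNode == true keeps the
// leaf-first order "1-0-1; 0-0-0".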
/**
* check the number of the arguments:
* java GetCluster arg1 arg2 ...
*
* @param length the number of arguments;
* in this program, length should be 1
*/
public void checkParameters(int length) {
if (length != 1) {
System.out.println("Usage: java GetCluster inputFileName");
System.exit(1);
}
}
/**
*
* @param inputFileName the name of the input file
* @return a Weka Instances object
*/
public Instances input(String inputFileName) {
String[] inputName = inputFileName.split("\\.");
Instances oriData = null;
try {
if (inputName[inputName.length - 1].compareToIgnoreCase("csv") == 0) {
// read from csv file
readCSV(inputFileName);
FileReader f = new FileReader(inputFileName + ".arff");
BufferedReader b = new BufferedReader(f);
oriData = new Instances(b);
} else if (inputName[inputName.length - 1].compareToIgnoreCase("arff") == 0) {
// read from arff data
FileReader f = new FileReader(inputFileName);
BufferedReader b = new BufferedReader(f);
oriData = new Instances(b);
} else {
System.out.println("only .arff or .csv format allowed!");
System.exit(1);
}
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return oriData;
}
/**
* read a CSV file and convert it to an ARFF file
* @param inputName the name of the CSV file
*/
public void readCSV(String inputName) {
try {
FileReader fr = new FileReader(inputName);
BufferedReader br = new BufferedReader(fr);
FileOutputStream stream;// provides file access
OutputStreamWriter writer;// writes to the file
stream = new FileOutputStream(new File(inputName + ".arff"), false);
writer = new OutputStreamWriter(stream);
String strLine = br.readLine();
String[] varNameArray = strLine.split("\\,");
writer.write("@RELATION dataset" + "\n\n");
for (int i = 0; i < varNameArray.length; i++) {
if (i < 2) {
writer.write("@ATTRIBUTE" + " " + "\"" + varNameArray[i] + "\"" + " " + "string" + "\n");
} else {
writer.write("@ATTRIBUTE" + " " + "\"" + varNameArray[i] + "\"" + " " + "numeric" + "\n");
}
}
writer.write("\n@DATA\n");
while ((strLine = br.readLine()) != null) {
writer.write(strLine + "\n");
}
writer.close();
stream.close();
fr.close();
br.close();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
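// Example of the conversion (hypothetical data): a CSV header line
// "orf,gene,v1,v2" becomes
//   @ATTRIBUTE "orf" string
//   @ATTRIBUTE "gene" string
//   @ATTRIBUTE "v1" numeric
//   @ATTRIBUTE "v2" numeric
// followed by "@DATA" and the remaining CSV rows copied verbatim.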
/**
*
* @param data
* @return an array containing the first element
* of each instance of the input data
*/
private String[] getClusterNames(Instances data) {
String[] result = new String[data.numInstances()];
for (int i = 0; i < result.length; i++) {
String[] strArray = data.instance(i).toString().split("\\,");
result[i] = strArray[0];
}
return result;
}
private double[] getGoID(String[] clusterNames, String lookupFile, String backgroundFile, String count) {
//********************************
// part 2, calculate RawGoID
//********************************
double[] result = new double[2];
//initialize local variables:
RawGoID myRawGoID = new RawGoID(clusterNames, lookupFile, backgroundFile);
double clusterGoid = myRawGoID.getRawGoID();
double randomAve = 0.0;
double randomStd = 0.0;
double zScore = 0.0;
// System.out.println("real cluster raw GOid =" + clusterGoid);
// get 'repeat time' random rawGoIDs
double[] randomGoid = new double[Integer.parseInt(count)];
for (int i = 0; i < Integer.parseInt(count); i++) {
randomGoid[i] = myRawGoID.getRandomRawGoID();
// System.out.println("now is in loop :" + (i + 1));
// System.out.println("randomGOid = " + randomGoid[i]);
}
//calculate
randomAve = Stats.getMean(randomGoid);
randomStd = Stats.getStdDev(randomGoid);
zScore = Stats.getZscore(randomAve, randomStd, clusterGoid);
result[0] = clusterGoid;
result[1] = zScore;
return result;
}
private double[] getGoIDFromFunc(String[] clusterNames, String lookupFile, String backgroundFile, String count) {
//********************************
// part 2, calculate RawGoID
//********************************
double[] result = new double[2];
//initialize local variables:
RawGoID myRawGoID = new RawGoID(clusterNames, lookupFile, backgroundFile);
double clusterGoid = myRawGoID.getRawGoID();
double randomAve = 0.0;
double randomStd = 0.0;
double zScore = 0.0;
// System.out.println("real cluster raw GOid =" + clusterGoid);
// get 'repeat time' random rawGoIDs
// (note: these random values are not used below; the mean and standard
// deviation come from the fitted functions instead)
double[] randomGoid = new double[Integer.parseInt(count)];
for (int i = 0; i < Integer.parseInt(count); i++) {
randomGoid[i] = myRawGoID.getRandomRawGoID();
// System.out.println("now is in loop :" + (i + 1));
// System.out.println("randomGOid = " + randomGoid[i]);
}
//calculate
randomAve = Stats.getMeanFromFunc(myRawGoID.getOriClusterSize());
randomStd = Stats.getStdDevFromFunc(myRawGoID.getOriClusterSize());
zScore = Stats.getZscore(randomAve, randomStd, clusterGoid);
result[0] = clusterGoid;
result[1] = zScore;
return result;
}
}//end of class
final class TreeNode {
private String strName;
private double dLikelihood;
private Instances data;
private TreeNode parent;
// TreeNode child;
/**
* @param strName name of node
* @param likelihood likelihood of the data
* @param data data set
* @param parent its parent node
*/
public TreeNode(String strName, double likelihood, Instances data, TreeNode parent) {
this.strName = strName;
dLikelihood = likelihood;
this.data = data;
this.parent = parent;
}
/**
* @return the data
*/
public Instances getData() {
return data;
}
/**
* @param data the data to set
*/
public void setData(Instances data) {
this.data = data;
}
/**
* @return the dLikelihood
*/
public double getDLikelihood() {
return dLikelihood;
}
/**
* @param likelihood the dLikelihood to set
*/
public void setDLikelihood(double likelihood) {
dLikelihood = likelihood;
}
/**
* @return the parent
*/
public TreeNode getParent() {
return parent;
}
/**
* @param parent the parent to set
*/
public void setParent(TreeNode parent) {
this.parent = parent;
}
/**
* @return the strName
*/
public String getStrName() {
return strName;
}
/**
* @param strName the strName to set
*/
public void setStrName(String strName) {
this.strName = strName;
}
}

View File: Information.java

@@ -0,0 +1,165 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.util.HashMap;
import java.util.Iterator;
/**
*
* @author DTian
*/
public class Information {
/**
*
* @param data the array of symbols
* @return the entropy (in bits) of the empirical distribution of data
*/
public static double entropy(String [] data ) {
double entropy = 0;
// Frequency table
HashMap freqDict = new HashMap();
int one = 1;
for(int i=0; i<data.length; i++){
String newkey = data[i];
if (freqDict.containsKey(newkey)) {
int val = Integer.parseInt(freqDict.get(newkey).toString());
freqDict.remove(newkey);
val = val + 1;
freqDict.put(newkey, val + "");
} else {
freqDict.put(newkey, (one + ""));
}
}
// Probability table
HashMap probDict = new HashMap();
Iterator it = freqDict.keySet().iterator();
String newkey = "";
while (it.hasNext()) {
newkey = (String) it.next();
double value = 0.0;
value = Double.parseDouble((String) freqDict.get(newkey)) / data.length;
probDict.put(newkey, value + "");
}
// Calculate entropy
it = probDict.keySet().iterator();
while (it.hasNext()) {
newkey = (String) it.next();
double value = 0.0;
value = Double.parseDouble((String) probDict.get(newkey));
entropy = entropy - value * (Math.log(value) / Math.log(2));
}
return entropy;
}
public static double relativeEntropy(String[] data1, String[] data2) {
double result = 0;
// System.out.println(data1.length);
// Frequency table
HashMap freqDict1 = new HashMap();
int one = 1;
for(int i=0; i<data1.length; i++){
Object key = data1[i];
if(freqDict1.containsKey(key)){
int val = Integer.parseInt( freqDict1.get(key).toString());
//freqDict1.remove(key);
val++;
freqDict1.put(key, val + "");
} else {
freqDict1.put(key, (one + ""));
}
}
// toFileHM(freqDict1, "FreqDict1.txt");
HashMap freqDict2 = new HashMap();
for (int i=0; i<data2.length; i++) {
Object key = data2[i];
if (freqDict2.containsKey(key)) {
int val = Integer.parseInt(freqDict2.get(key).toString());
//freqDict2.remove(key);
val++;
freqDict2.put(key, val + "");
} else {
freqDict2.put(key, (one + ""));
}
}
// Probability table
HashMap<Object, Object> probDict1 = new HashMap<Object, Object>();
HashMap<Object, Object> probDict2 = new HashMap<Object, Object>();
Iterator it = freqDict1.keySet().iterator();
while (it.hasNext()) {
Object newkey = it.next();
double value = 0;
value = Double.parseDouble((String) freqDict1.get(newkey)) / data1.length;
probDict1.put(newkey, value + "");
}
it = freqDict2.keySet().iterator();
while (it.hasNext()) {
Object newkey = it.next();
double value = 0;
value = Double.parseDouble((String) freqDict2.get(newkey)) / data2.length;
probDict2.put(newkey, value + "");
}
// Calculate the relative entropy
it = probDict1.keySet().iterator();
while (it.hasNext()) {
Object newkey = it.next();
Object value1 = probDict1.get(newkey);
//Object value2 = probDict2.get(newkey);
double dValue1 = Double.parseDouble(probDict1.get(newkey).toString());
double dValue2 = Double.parseDouble(probDict2.get(newkey).toString());
if ( value1.toString().trim().compareToIgnoreCase("1.0") == 0) {
// p = 1: only the "present" term contributes (avoids the NaN from 0*log 0 in the other term)
result = result + dValue1 * (Math.log(dValue1/dValue2) / Math.log(2));
} else if (value1.toString().trim().compareToIgnoreCase("0") == 0){
// p = 0: only the "absent" term contributes
result = result + (1-dValue1) * (Math.log((1-dValue1)/(1-dValue2)) / Math.log(2));
} else {
// 0 < p < 1: both terms of the binary relative entropy contribute
result = result + dValue1 * (Math.log(dValue1/dValue2) / Math.log(2));
result = result + (1-dValue1) * (Math.log((1-dValue1)/(1-dValue2)) / Math.log(2));
}
// toFile(result+"", "probDict1.txt");
// toFile(result, "resultsOfresult.txt");//check point by Jingyu
}
//toFile(probDict1.size()+ "*******************", "probDict1.txt");
//System.out.println("relative entropy = " + result);
return result;
}
private static void toFile(String data, String filename) {
// Output to file
try {
BufferedWriter writer = new BufferedWriter(new FileWriter(filename,true));
writer.write(data + "\n");
writer.close();
} catch (Exception e) {
e.printStackTrace();
}
}
private static void toFileHM(HashMap data, String filename) {
// Output to file
try {
BufferedWriter writer = new BufferedWriter(new FileWriter(filename, true));
for (Object key : data.keySet()) {
writer.write(key.toString() +":"+ data.get(key)+"\n");
}
writer.close();
} catch (Exception e) {
e.printStackTrace();
}
}
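// Hypothetical demo, not part of the original pipeline: a uniform two-symbol
// distribution has entropy 1 bit, and the relative entropy of a distribution
// against itself is 0.
public static void main(String[] args) {
String[] sample = {"a", "a", "b", "b"};
System.out.println(entropy(sample)); // expected: 1.0
System.out.println(relativeEntropy(sample, sample)); // expected: 0.0
}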
}

View File: Matrix.java

@@ -0,0 +1,130 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
/**
*
* @author DTian
*/
public class Matrix {
private HashMap matrix; // store data
private int rowSize; // row size of matrix
private int colSize; // column size of value array
private final int lookupTableSize = 9000; //size of lookup table
public int getColSize() {
return colSize;
}
public void setColSize(int colSize) {
this.colSize = colSize;
}
public HashMap getMatrix() {
return matrix;
}
public void setMatrix(HashMap matrix) {
this.matrix = matrix;
}
public int getRowSize() {
return rowSize;
}
public void setRowSize(int rowSize) {
this.rowSize = rowSize;
}
public Matrix() {
rowSize = 0;
colSize = 0;
matrix = new HashMap();
}
/**
* constructor with 1 String parameter
*
* @param filename : the name of the input file
*
* Creates a matrix from an input file.
*
*/
public Matrix(String filename) {
// Initialize variables
this.setRowSize(0);
this.setColSize(0);
matrix = new HashMap(lookupTableSize);
try {
FileReader fr = new FileReader(filename);
BufferedReader br = new BufferedReader(fr);
// strRow is used to read lines from the file (the first row is skipped)
String strRow = br.readLine();
// The while loop reads the data rows from the file into the matrix
while ((strRow = br.readLine()) != null) {
// pick the delimiter: comma for CSV files, whitespace otherwise
String delimiter = "";
if (strRow.indexOf(",") >= 0) { //for CSV file
delimiter = "\\,";
} else { // for whitespace-delimited file
delimiter = "\\s+";
}
String[] strArray = strRow.trim().split(delimiter);
String[] strArrValue = Arrays.copyOfRange(strArray, 1, strArray.length);
// strArray[0] is the orf name, others are value
matrix.put(strArray[0].trim().toLowerCase(), strArrValue);
rowSize++;
colSize = strArrValue.length;
}
br.close();
fr.close();
} catch (IOException e) {
// catch possible io errors from readLine()
System.out.println("IOException error in 'class Matrix, constructor'");
}
}
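// Example (hypothetical data row; the first line of the file is skipped as a
// header): the line "YAL001C,0,1,0" is stored as
// matrix.get("yal001c") -> {"0", "1", "0"}, with colSize = 3.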
/**
*
* @param key the specified key
* @return the string array of the value
*/
public String[] getSpecifiedValue(Object key) {
return (String[]) matrix.get(key);
}
/**
* @return the list of orf names
*/
public ArrayList getOrfNames() {
ArrayList result = new ArrayList(this.getRowSize());
Iterator it = matrix.keySet().iterator();
while (it.hasNext()) {
result.add(it.next());
}
return result;
}
public void addValue(Object key, Object value) {
matrix.put(key, value);
}
}

View File: RawGoID.java

@@ -0,0 +1,375 @@
/*
* The input: 3 files. 1 is the cluster file; 2 is the GO matrix file (lookup table);
* 3 is the background file (pool).
*
*
*/
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Random;
/**
*
* @author DTian
*/
public class RawGoID {
private ArrayList clusterGeneList; // for the input cluster file
private Matrix poolTable; //for the filtered gene pool list
private Matrix lookupTable; // for the lookup attribute table
private int oriClusterSize; //for the original cluster size
private ArrayList oriPoolOrfsName;//for the complete list of pool table
// private String randomFilename;
public Matrix getLookupTable() {
return lookupTable;
}
public void setLookupTable(Matrix lookupTable) {
this.lookupTable = lookupTable;
}
public Matrix getPoolTable() {
return poolTable;
}
public void setPoolTable(Matrix poolTable) {
this.poolTable = poolTable;
}
public ArrayList getClusterGeneList() {
return clusterGeneList;
}
public void setClusterGeneList(ArrayList clusterGeneList) {
this.clusterGeneList = clusterGeneList;
}
public RawGoID() {
clusterGeneList = new ArrayList();
poolTable = new Matrix();
lookupTable = new Matrix();
// randomFilename ="";
}
public void setOriClusterSize(int oriClusterSize) {
this.oriClusterSize = oriClusterSize;
}
public int getOriClusterSize () {
return oriClusterSize;
}
public void setOriPoolOrfsName(ArrayList oriPoolOrfsName) {
this.oriPoolOrfsName = oriPoolOrfsName;
}
public ArrayList getOriPoolOrfsName() {
return oriPoolOrfsName;
}
/**
*
* @param clusterFilename : cluster Filename
* @param GoMatrixFilename : GoMatrix Filename
* @param backGroundFilename : backGround Filename
*/
public RawGoID(String clusterFilename, String GoMatrixFilename, String backGroundFilename) {
try {
clusterGeneList = new ArrayList(200);
// Get the smallGeneList (a cluster )
BufferedReader br = new BufferedReader(new FileReader(clusterFilename));
// strRow is used to read lines from the file
String strRow = "";
while ((strRow = br.readLine()) != null) {
clusterGeneList.add(strRow.trim().toLowerCase());
}
// System.out.println(clusterGeneList.size());
setOriClusterSize(clusterGeneList.size());
// System.out.println("original cluster size =" + clusterGeneList.size());
// Get the matrix (lookup table)
lookupTable = new Matrix(GoMatrixFilename);
// Get the bigGeneList (pool or background file)
br = new BufferedReader(new FileReader(backGroundFilename));
ArrayList poolOrfsName = new ArrayList(5000);
while ((strRow = br.readLine()) != null) {
poolOrfsName.add(strRow.trim().toLowerCase());
}
this.setOriPoolOrfsName(poolOrfsName);
poolTable = new Matrix();
for (int i = 0; i < poolOrfsName.size(); i++) {
Object tempKey = poolOrfsName.get(i);
if (lookupTable.getMatrix().containsKey(tempKey)) {
poolTable.addValue(tempKey, lookupTable.getMatrix().get(tempKey));
}
}
poolTable.setRowSize(poolTable.getMatrix().size());
poolTable.setColSize(lookupTable.getColSize());
br.close();
// This loop takes out any ORF from the cluster gene list that does not exist in the pool table;
// not necessary if all cluster ORFs come from the pool table.
// Iterate backwards so removals do not skip elements.
for (int i=clusterGeneList.size()-1;i>=0;i--){
Object tempKey = clusterGeneList.get(i);
if (!poolTable.getMatrix().containsKey(tempKey)){
clusterGeneList.remove(i);
}
}
// System.out.println("length of real cluster gene List after filtering = " + clusterGeneList.size());
// Check point
// System.out.println(clusterGeneList);
} catch (IOException e) {
// Catch possible io errors from readLine()
System.out.println("IOException error in 'class GetGoID, constructor'");
}
// Checkpoint
// System.out.println("Column size of pooltable is:"+ poolTable.getColSize() +'\t'+ "Row size of pooltable is :"+ poolTable.getRowSize());
// randomFilename = "randomOrfName.txt";
}
/**
*
* @param clusterName : array of gene names in the cluster
* @param GoMatrixFilename : GoMatrix Filename
* @param backGroundFilename : backGround Filename
*/
public RawGoID(String[] clusterName, String GoMatrixFilename, String backGroundFilename) {
try {
clusterGeneList = new ArrayList(clusterName.length);
// Get the smallGeneList (a cluster )
for(String name: clusterName){
clusterGeneList.add(name.trim().toLowerCase());
}
// System.out.println(clusterGeneList.size());
setOriClusterSize(clusterGeneList.size());
// System.out.println("original cluster size =" + clusterGeneList.size());
// Get the matrix (lookup table)
lookupTable = new Matrix(GoMatrixFilename);
// Get the bigGeneList (pool or background file)
BufferedReader br = new BufferedReader(new FileReader(backGroundFilename));
ArrayList poolOrfsName = new ArrayList(5000);
String strRow = "";
while ((strRow = br.readLine()) != null) {
poolOrfsName.add(strRow.trim().toLowerCase());
}
this.setOriPoolOrfsName(poolOrfsName);
poolTable = new Matrix();
for (int i = 0; i < poolOrfsName.size(); i++) {
Object tempKey = poolOrfsName.get(i);
if(lookupTable.getMatrix().containsKey(tempKey)){
poolTable.addValue(tempKey, lookupTable.getMatrix().get(tempKey));
}
}
poolTable.setRowSize(poolTable.getMatrix().size());
poolTable.setColSize(lookupTable.getColSize());
br.close();
// This loop takes out any ORF from the cluster gene list that does not exist in the pool table;
// not necessary if all cluster ORFs come from the pool table.
// Iterate backwards so removals do not skip elements.
for (int i=clusterGeneList.size()-1;i>=0;i--){
Object tempKey = clusterGeneList.get(i);
if (!poolTable.getMatrix().containsKey(tempKey)){
clusterGeneList.remove(i);
}
}
// System.out.println("length of real cluster gene List after filtering = " + clusterGeneList.size());
// Checkpoint
// System.out.println(clusterGeneList);
} catch (IOException e) {
// Catch possible io errors from readLine()
System.out.println("IOException error in 'class GetGoID, constructor'");
}
// Checkpoint
// System.out.println("Column size of pooltable is:"+ poolTable.getColSize() +'\t'+ "Row size of pooltable is :"+ poolTable.getRowSize());
// randomFilename = "randomOrfName.txt";
}
public double getRawGoID() {
double result = 0.0;
ArrayList fullMatrix = new ArrayList(this.getPoolTable().getRowSize());
ArrayList subMatrix = new ArrayList(this.getClusterGeneList().size());
// Fill the fullMatrix with pool table data
Iterator it = this.getPoolTable().getMatrix().keySet().iterator();
while (it.hasNext()) {
Object key = it.next();
fullMatrix.add(this.getPoolTable().getMatrix().get(key.toString().toLowerCase()));
}
// System.out.println("size of fullMatrix is:"+ fullMatrix.size());
// Fill the subMatrix with lookup table data and cluster information
for (Object element : this.getClusterGeneList()) {
if (this.getLookupTable().getMatrix().containsKey(element)) {
subMatrix.add(this.getLookupTable().getMatrix().get(element.toString().toLowerCase()));
}
}
// System.out.println("size of subMatrix is:"+ subMatrix.size());
// Transpose the 2 matrices
ArrayList attrByOrfFullMatrix = this.transpose(fullMatrix);
ArrayList attrByOrfSubMatrix = this.transpose(subMatrix);
// System.out.println("size of transposed fullMatrix is:"+ attrByOrfFullMatrix.size());
// System.out.println("size of transposed subMatrix is:"+ attrByOrfSubMatrix.size());
// Calculate the raw GoID
for (int i = 0; i < attrByOrfFullMatrix.size(); i++) {
// Check whether this attribute has any non-zero entry in the pool.
// (Per tdh: from the source code this step is not strictly needed;
// Jingyu notes the break also speeds up the scan.)
int nonZeroCount = 0;
String[] tempArray = (String[]) attrByOrfFullMatrix.get(i);
for (int j = 0; j < tempArray.length; j++) {
if (tempArray[j].trim().compareToIgnoreCase("1")==0 ) {
nonZeroCount++;
break; // one non-zero entry is enough
}
}
}
// System.out.println("nonzeroCount =" + nonZeroCount);//Jingyu add
if (nonZeroCount >= 0) {
result = result + Information.relativeEntropy(
((String[]) attrByOrfSubMatrix.get(i)),
(String[]) (attrByOrfFullMatrix.get(i)));
}
// System.out.println(Information.relativeEntropy(
// ((String[]) attrByOrfSubMatrix.get(i)), (String[]) (attrByOrfFullMatrix.get(i))));
}
// System.out.println("result =" + result);
return result;
}
private ArrayList transpose(ArrayList data) {
ArrayList result = new ArrayList(data.size());
// Do transpose here
int rowSize = data.size();
int colSize = ((String[]) data.get(0)).length;
String[][] matrix = new String[colSize][rowSize];
for (int i = 0; i < rowSize; i++) {
String[] temp = (String[]) data.get(i);
for (int j = 0; j < colSize; j++) {
// System.out.println("j is : " + j);
matrix[j][i] = temp[j];
}
}
// Convert to ArrayList
for (int i = 0; i < matrix.length; i++) {
result.add(matrix[i]);
}
return result;
}
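// Example: transpose of {{"a","b","c"}, {"d","e","f"}} (2 rows x 3 columns)
// returns {{"a","d"}, {"b","e"}, {"c","f"}} (3 rows x 2 columns).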
public double getRandomRawGoID() {
double result = 0.0;
this.setClusterGeneList(this.getRandomCluster(this.getOriClusterSize()));
result = this.getRawGoID();
if (Double.isNaN(result)) {
return getRandomRawGoID();
} else {
return result;
}
}
private void toFile(HashMap data, String filename) {
// Output to a file
try {
BufferedWriter writer = new BufferedWriter(new FileWriter(filename));
for (Object key : data.keySet()) {
writer.write(key.toString() + "\n");
}
writer.close();
} catch (Exception e) {
e.printStackTrace();
}
}
private static void toFileString(String data, String filename) {
// Output to a file
try {
BufferedWriter writer = new BufferedWriter(new FileWriter(filename,true));
writer.write(data + "\n");
writer.close();
} catch (Exception e) {
e.printStackTrace();
}
}
private ArrayList getRandomCluster(int clusterSize) {
ArrayList<String> result = new ArrayList(clusterSize);
// Jingyu: the deactivated code below draws the random cluster from a
// lookup-table-filtered pool table. Doing so can bias the average random
// raw GOid score by sampling from a smaller pool list.
// HashMap hm = new HashMap(this.getClusterGeneList().size());
// while (hm.keySet().size() < clusterSize) {
// hm.put(this.getPoolTable().getOrfNames().get(randInt(this.getPoolTable().getOrfNames().size())), "0");
// }
// result.addAll(hm.keySet());
// Instead, draw a random cluster of the same size as the cluster file from the original ORF pool.
ArrayList localOriPoolTable = this.getOriPoolOrfsName();
// Checkpoint
// System.out.println(localOriPoolTable.size());
for (int i=0;i<clusterSize;i++){
result.add(localOriPoolTable.get(randInt(localOriPoolTable.size())).toString().trim().toLowerCase());
}
return result;
}
private static final Random RNG = new Random();
/**
*
* @param max the exclusive upper bound
*
* @return : a random integer between 0 (inclusive) and max (exclusive)
*/
private int randInt(int max) {
// Random.nextInt(max) avoids the modulo bias and the possible
// Math.abs(Integer.MIN_VALUE) overflow of a hand-rolled abs-mod approach.
return RNG.nextInt(max);
}
// This method is not used for the final code.
// private String [] getZeroStringArray(int length) {
// String [] tmpStrArray = new String[length];
// for (int j=0; j<tmpStrArray.length; j++) {
// tmpStrArray[j] = "0";
// }
// return tmpStrArray;
// }
}

View File: SGD2AttrTable.java

@@ -0,0 +1,155 @@
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.HashMap;
import java.util.Iterator;
/*
* This program first creates an intermediate table and then applies the function
* from Dr. Brett McKinney to create the attribute table.
*/
/**
*
* @author DTian
*/
public class SGD2AttrTable {
public void createIntermediateTable(String inputFile, String outputFile) {
HashMap geneToGODict = new HashMap();
try {
FileReader fr = new FileReader(inputFile);
BufferedReader br = new BufferedReader(fr);
// strRow is used to read lines from the file (the first row is skipped)
String strRow = br.readLine();
// The while loop reads the data rows from the file
while ((strRow = br.readLine()) != null) {
// Check: skip the line if it is blank or a comment line
if (strRow.trim().isEmpty() || strRow.trim().charAt(0) != 'S' ) {
continue;
}
String [] strArray = strRow.trim().split("\\t");
String key = toKey(strArray[10].toUpperCase());
if (key.compareToIgnoreCase("") == 0) {
continue;
}
String value = toValue(strArray[4]);
if (geneToGODict.containsKey(key)) {
geneToGODict.put(key, geneToGODict.get(key)+ "\t" + value);
} else {
geneToGODict.put(key, value);
}
}
br.close();
fr.close();
// Write to output file
FileOutputStream stream; // provides file access
OutputStreamWriter writer; // writes to the file
stream = new FileOutputStream(new File(outputFile), true);
writer = new OutputStreamWriter(stream);
Iterator it = geneToGODict.keySet().iterator();
while(it.hasNext()){
String key = it.next().toString();
String value = geneToGODict.get(key).toString();
writer.write(key + "\t" + value + "\n");
}
writer.flush();
writer.close();
stream.close();
} catch (IOException e) {
// Catch possible io errors from readLine()
System.out.println("IOException error in 'class SGD2AttrTable, method createIntermediateTable'");
}
}
public void createAttrTable(String intermediaFile, String outputFile){
HashMap geneToGODict = new HashMap();
try {
FileReader fr = new FileReader(intermediaFile);
BufferedReader br = new BufferedReader(fr);
// strRow is used to read lines from the file (the first row is skipped)
String strRow = br.readLine();
// The while loop reads the data rows from the file
while ((strRow = br.readLine()) != null) {
// Check: skip the line if it is blank or a comment line
if (strRow.trim().isEmpty() || strRow.trim().charAt(0) != 'S' ) {
continue;
}
String [] strArray = strRow.trim().split("\\t");
String key = toKey(strArray[10].toUpperCase());
if (key.compareToIgnoreCase("") == 0) {
continue;
}
String value = toValue(strArray[4]);
if (geneToGODict.containsKey(key)) {
geneToGODict.put(key, geneToGODict.get(key)+ "\t" + value);
} else {
geneToGODict.put(key, value);
}
}
br.close();
fr.close();
// Write to output file
FileOutputStream stream; // provides file access
OutputStreamWriter writer; // writes to the file
stream = new FileOutputStream(new File(outputFile), true);
writer = new OutputStreamWriter(stream);
Iterator it = geneToGODict.keySet().iterator();
while (it.hasNext()) {
String key = it.next().toString();
String value = geneToGODict.get(key).toString();
writer.write(key + "\t" + value + "\n");
}
writer.flush();
writer.close();
stream.close();
} catch (IOException e) {
// Catch possible io errors from readLine()
System.out.println("IOException error in 'class SGD2AttrTable, method createIntermediateTable'");
}
}
/**
*
* @param raw the string to strip the "GO:" prefix and leading zeros from
* @return the string without the "GO:" prefix and leading zeros
*/
private String toValue(String raw) {
String result = raw.toUpperCase(); //raw should be like: "GO:0005739"
// Delete "GO:"
result = result.substring(3);
// Delete "lead zeros"
while (result.charAt(0) == '0') {
result =result.substring(1);
}
return result;
}
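// Example: toValue("GO:0005739") returns "5739".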
private String toKey(String raw) {
String result = raw.toUpperCase(); // raw is the gene-name field, possibly "NAME|ALIAS|..."
// Find the '|'
int end = result.indexOf('|');
// Get the sub string
if (end < 0) {
return result;
} else {
return result.substring(0, end);
}
}
}

View File: Stats.java

@@ -0,0 +1,75 @@
/* Reference Python implementation that this class ports:
* def stats(self,r):
#returns the average, standard deviation, and min of a sequence
tot = sum(r)
ave = tot/len(r)
sdsq = sum([(i-ave)**2 for i in r])
s = list(r)
s.sort()
#median = s[len(s)//2]
return ave, (sdsq/(len(r)-1 or 1))**.5
def zscore(self,pop_mean,pop_std,raw_goid):
return (raw_goid - pop_mean)/pop_std
*/
/**
*
* @author DTian
*/
public class Stats {
/**
*
* @param data the double array
* @return the standard deviation of the array
*/
public static double getStdDev(double[] data) {
double result = 0.0;
double ave = getMean(data);
for (double d : data) {
result += Math.pow((d-ave), 2);
}
if (data.length > 1) {
return Math.sqrt(result/(data.length-1));
} else {
return Math.sqrt(result);
}
}
/**
*
* @param data the double array
* @return the mean of the double array.
*/
public static double getMean(double[] data) {
double result = 0.0;
for (double d : data) {
result += d;
}
return (result/data.length);
}
/**
*
* @param size the size of the original cluster file
* @return the mean estimated from the fitted function.
*/
public static double getMeanFromFunc(int size) {
return ( -4.8616 + 71.1806/Math.pow(size, 0.33511));
}
/**
*
* @param size the size of the original cluster file
* @return the standard deviation estimated from the fitted function.
*/
public static double getStdDevFromFunc(int size) {
return ( -0.04943 + 56.634/Math.pow(size, 0.89384));
}
public static double getZscore(double popMean, double popStd, double rawGoid) {
return (rawGoid - popMean)/popStd;
}
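// Hypothetical demo, not part of the original pipeline: z-score of a raw GOid
// score against an empirical random distribution (illustrative values only).
public static void main(String[] args) {
double[] randomScores = {2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0};
double mean = getMean(randomScores); // 5.0
double std = getStdDev(randomScores); // ~2.14 (sample standard deviation)
System.out.println(getZscore(mean, std, 8.0)); // ~1.40
}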
}