bryan
/
server-scripts


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532
							
/**
 * This program takes an input file (in either ARFF or CSV format) and
 * outputs three files: the tree structure, the final table with all
 * information, and the summary information.
 * 
 */
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Vector;

import weka.clusterers.ClusterEvaluation;
import weka.clusterers.EM;
import weka.core.Instances;

/**
 * Recursively splits a data set with Weka's EM clusterer and records the
 * resulting cluster tree as a text log, an XML tree, a final per-instance
 * table and a per-leaf summary.
 *
 * The class holds no state of its own; everything is passed in as arguments.
 */
public class GetClusters {

    /**
     * Creates a clustering helper. There is nothing to initialise.
     */
    public GetClusters() {
    }

    /**
     * Clusters the data of {@code root} with an EM clusterer (default options)
     * and recurses into every sub-cluster until a node yields a single cluster.
     *
     * For every node the log likelihood, GoID and z-score are appended to
     * {@code outputFilename}. Inner nodes emit a {@code <branch>} element per
     * sub-cluster to {@code xmlWriter}; leaf nodes emit one {@code <leaf>}
     * element per instance and add rows to the two result vectors.
     *
     * Any exception is printed and swallowed, so the returned count may be
     * partial when something failed part-way through.
     *
     * @param root           the tree node whose data is clustered
     * @param generation     the depth of {@code root} in the tree
     * @param position       dotted path describing where {@code root} sits at its depth
     * @param vecFinalTable  receives one CSV row per instance of every leaf
     *                       (log likelihood, instance, ancestry, cluster ID)
     * @param vecSummary     receives one CSV row per leaf
     *                       (ancestry, cluster ID, instance count, log likelihood);
     *                       its last row is also read to derive the next cluster ID
     * @param outputFilename text log file, opened in append mode
     * @param xmlWriter      destination for the XML tree; flushed but never closed here
     * @param lookupFile     passed through to {@code RawGoID}
     * @param backgroundFile passed through to {@code RawGoID}
     * @param count          number of random GoID samples, as a decimal string
     * @param fromFunction   {@code true} selects {@code getGoID} (statistics from the
     *                       random samples), {@code false} selects {@code getGoIDFromFunc}
     * @return the number of leaf nodes found at or below {@code root}
     */
    public int clustering(TreeNode root, int generation,
            String position, Vector vecFinalTable,
            Vector vecSummary, String outputFilename, OutputStreamWriter xmlWriter,
            String lookupFile, String backgroundFile, String count, boolean fromFunction) {
        int result = 0;
        FileOutputStream stream = null; // provides file access
        OutputStreamWriter writer = null; // writes to the file
        try {
            // Append mode: every recursion level re-opens the same log file, so
            // this level always flushes before handing control to a child.
            stream = new FileOutputStream(new File(outputFilename), true);
            writer = new OutputStreamWriter(stream);

            // ***** 1 create a copy of original data *****
            Instances oriData = root.getData();
            Instances data = new Instances(oriData);

            // ***** 2 remove string attributes (e.g. orf_name) before clustering *****
            data.deleteStringAttributes();

            // ***** 3 clustering *****
            EM clusterer = new EM();
            clusterer.buildClusterer(data);

            ClusterEvaluation eval = new ClusterEvaluation();
            eval.setClusterer(clusterer);
            eval.evaluateClusterer(data);

            // GoID and z-score of this node, computed from the first field of
            // every instance in the untouched data.
            String[] clusterNames = getClusterNames(oriData);
            double[] goID;
            // NOTE(review): the flag name suggests the two branches may be
            // swapped (true -> getGoIDFromFunc). Left as-is because callers
            // may rely on the current mapping -- confirm.
            if (fromFunction) {
                goID = this.getGoID(clusterNames, lookupFile, backgroundFile, count);
            } else {
                goID = this.getGoIDFromFunc(clusterNames, lookupFile, backgroundFile, count);
            }

            double logLikelihood = eval.getLogLikelihood();
            writer.write("logLikelihood is: " + logLikelihood + "\n");
            writer.write("GoID is: " + goID[0] + "\n");
            writer.write("zScore is: " + goID[1] + "\n\n");
            writer.flush();

            // ***** 4 get the sub clusters *****
            int numberOfSubCluster = eval.getNumClusters();
            if (numberOfSubCluster > 1) { // not an end node

                // One empty container per sub-cluster, sharing the original header.
                Instances[] subData = new Instances[numberOfSubCluster];
                for (int i = 0; i < numberOfSubCluster; i++) {
                    subData[i] = new Instances(oriData, 0);
                }

                // Assign each instance to its cluster. Values outside
                // [0, numberOfSubCluster) are skipped, exactly as the
                // original linear scan did.
                double[] assignments = eval.getClusterAssignments();
                for (int i = 0; i < assignments.length; i++) {
                    int clusterNumber = (int) assignments[i];
                    if (clusterNumber >= 0 && clusterNumber < subData.length) {
                        subData[clusterNumber].add(oriData.instance(i));
                    }
                }

                // ***** 5 recursive call *****
                String uniName = generation + "-" + position;
                int childGeneration = generation + 1;
                for (int i = 0; i < numberOfSubCluster; i++) {
                    String name = uniName + "-" + i;
                    writer.write("\n******************************\n");
                    writer.write("cluster name: " + name + "\n");
                    writer.flush();
                    xmlWriter.write(" <branch>\n  <attribute name=\"name\" value=\"" + name + "\"/>\n");
                    xmlWriter.flush();

                    TreeNode child = new TreeNode(name, logLikelihood, subData[i], root);
                    result += clustering(child, childGeneration,
                            position + "." + i, vecFinalTable, vecSummary, outputFilename,
                            xmlWriter, lookupFile, backgroundFile, count, fromFunction);
                    xmlWriter.write(" </branch>\n");
                    xmlWriter.flush();
                }
            } else { // leaf node
                result = 1;

                // Cluster IDs are consecutive: one more than the ID stored in
                // the second field of the most recent summary row.
                int clusterId = 1;
                if (!vecSummary.isEmpty()) {
                    String previousId = (vecSummary.lastElement().toString()).split(",")[1];
                    clusterId = Integer.parseInt(previousId.trim()) + 1;
                }
                writer.write("leaf node\n");
                writer.flush();

                String ancestry = getAncestor(root, false);
                int instanceCount = oriData.numInstances();
                for (int i = 0; i < instanceCount; i++) {
                    String row = logLikelihood + "," + oriData.instance(i) + "," + ancestry + "," + clusterId;
                    writer.write(row + "\n");
                    writer.flush();
                    xmlWriter.write("<leaf>\n <attribute name=\"name\" value=\"" + oriData.instance(i).stringValue(0) + "\"/>\n</leaf>\n");
                    xmlWriter.flush();
                    vecFinalTable.addElement(row);
                }

                vecSummary.addElement(ancestry + "," + clusterId + "," + instanceCount + "," + logLikelihood);
                writer.write("******************************\n");
                writer.flush();
            }
        } catch (Exception e) {
            // Best effort: report the failure and return what was counted so far.
            e.printStackTrace();
        } finally {
            closeAndReport(writer);
            closeAndReport(stream);
        }
        return result;
    }//end of method "clustering"

    /**
     * Appends the root cluster name to a file.
     *
     * @param fileName  output file name (opened in append mode)
     * @param rootName  the root cluster name
     */
    public void printRootName(String fileName, String rootName) {
        FileOutputStream stream = null;
        OutputStreamWriter writer = null;
        try {
            stream = new FileOutputStream(new File(fileName), true);
            writer = new OutputStreamWriter(stream);
            writer.write("root cluster is:" + rootName + "\n");
            writer.flush();
        } catch (IOException e) {
            // Also covers FileNotFoundException.
            e.printStackTrace();
        } finally {
            closeAndReport(writer);
            closeAndReport(stream);
        }
    }

    /**
     * Builds the CSV header row for the final table.
     *
     * The row starts with "likelihood", continues with one column per
     * attribute and ends with "cluster origin" and "cluster ID". A column
     * name is derived from the attribute's {@code toString()} by splitting it
     * on spaces, dropping the first and last token and concatenating the
     * remainder without separator.
     *
     * @param data the data set whose attributes name the columns
     * @return the header row, without a trailing line break
     */
    public String printTableHead(Instances data) {
        StringBuilder header = new StringBuilder("likelihood");
        for (int i = 0; i < data.numAttributes(); i++) {
            String[] tokens = data.attribute(i).toString().split("\\ ");
            header.append(',');
            for (int j = 1; j < tokens.length - 1; j++) {
                header.append(tokens[j]);
            }
        }
        return header.append(",cluster origin,cluster ID").toString();
    }//end of method "printTableHead"

    /**
     * Writes every element of a vector to a file, one per line, replacing
     * any previous content of that file.
     *
     * @param vec            the elements to write; each is rendered with {@code toString()}
     * @param outputFilename the file to (over)write
     */
    public void printVector(Vector vec, String outputFilename) {
        FileOutputStream stream = null;
        OutputStreamWriter writer = null;
        try {
            stream = new FileOutputStream(new File(outputFilename), false);
            writer = new OutputStreamWriter(stream);
            for (int i = 0; i < vec.size(); i++) {
                writer.write(vec.elementAt(i).toString() + "\n");
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException.
            e.printStackTrace();
        } finally {
            closeAndReport(writer);
            closeAndReport(stream);
        }
    }

    /**
     * Lists the names of a node and all of its ancestors.
     *
     * Leaf-first form: {@code "leaf; parent; root"}. Root-first form: the
     * leaf-first string is split on ';' and re-emitted in reverse with
     * "; " after every element. Because the split keeps the blank that
     * followed each ';', every element except the leaf's own name carries a
     * leading space in the root-first form.
     *
     * @param endNode       the node to start from
     * @param fromLeafNode  {@code true} for the leaf-first form,
     *                      {@code false} for the root-first form
     * @return the ancestry string in the requested order
     */
    public String getAncestor(TreeNode endNode, boolean fromLeafNode) {
        String leafFirst = endNode.getStrName();
        for (TreeNode node = endNode.getParent(); node != null; node = node.getParent()) {
            leafFirst += "; " + node.getStrName();
        }
        if (fromLeafNode) {
            return leafFirst;
        }

        String[] history = leafFirst.split("\\;");
        StringBuilder rootFirst = new StringBuilder();
        for (int i = history.length - 1; i >= 0; i--) {
            rootFirst.append(history[i]).append("; ");
        }
        return rootFirst.toString();
    }

    /**
     * Checks the number of command line arguments
     * ({@code java GetCluster arg1 arg2 ...}). Prints the usage line and
     * terminates the JVM with status 1 unless exactly one argument was given.
     *
     * @param length  the number of arguments; must be 1
     */
    public void checkParameters(int length) {
        if (length == 1) {
            return;
        }
        System.out.println("Usage: java GetCluster inputFileName");
        System.exit(1);
    }

    /**
     * Loads a data set from an ARFF or CSV file, chosen by the file
     * extension (case-insensitive). A CSV file is first converted with
     * {@link #readCSV(String)} and the generated {@code <name>.arff} is read.
     * Any other extension prints a message and terminates the JVM with status 1.
     *
     * @param inputFileName  the name of the input file
     * @return the loaded Weka {@code Instances}, or {@code null} if reading failed
     */
    public Instances input(String inputFileName) {
        // split() drops trailing empty tokens, so a name such as "." yields
        // an empty array; treat that as "no recognised extension".
        String[] nameParts = inputFileName.split("\\.");
        String extension = nameParts.length == 0 ? "" : nameParts[nameParts.length - 1];

        String arffFileName;
        if (extension.compareToIgnoreCase("csv") == 0) {
            readCSV(inputFileName);
            arffFileName = inputFileName + ".arff";
        } else if (extension.compareToIgnoreCase("arff") == 0) {
            arffFileName = inputFileName;
        } else {
            System.out.println("only .arff or .csv format allowed!");
            System.exit(1);
            return null; // only needed for the compiler's definite-assignment check
        }

        Instances oriData = null;
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(arffFileName));
            oriData = new Instances(reader);
        } catch (IOException e) {
            // Also covers FileNotFoundException.
            e.printStackTrace();
        } finally {
            closeAndReport(reader);
        }
        return oriData;
    }

    /**
     * Converts a CSV file into an ARFF file named {@code <inputName>.arff}.
     *
     * The first CSV line supplies the attribute names. The first two columns
     * are declared as string attributes and all remaining columns as numeric.
     * Every following line is copied unchanged into the data section.
     * An empty CSV file is reported as an {@code IOException} (printed, not
     * propagated) and leaves an empty ARFF file behind.
     *
     * @param inputName  the name of the csv file
     */
    public void readCSV(String inputName) {
        BufferedReader br = null;
        FileOutputStream stream = null;
        OutputStreamWriter writer = null;
        try {
            br = new BufferedReader(new FileReader(inputName));
            // Opened before the header is read so a stale ARFF file from an
            // earlier run is always truncated.
            stream = new FileOutputStream(new File(inputName + ".arff"), false);
            writer = new OutputStreamWriter(stream);

            String strLine = br.readLine();
            if (strLine == null) {
                throw new IOException("CSV file has no header line: " + inputName);
            }
            String[] varNameArray = strLine.split("\\,");

            writer.write("@RELATION dataset" + "\n\n");
            for (int i = 0; i < varNameArray.length; i++) {
                String type = (i < 2) ? "string" : "numeric";
                writer.write("@ATTRIBUTE" + " " + "\"" + varNameArray[i] + "\"" + " " + type + "\n");
            }
            writer.write("\n@DATA\n");
            while ((strLine = br.readLine()) != null) {
                writer.write(strLine + "\n");
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException.
            e.printStackTrace();
        } finally {
            closeAndReport(writer);
            closeAndReport(stream);
            closeAndReport(br);
        }
    }

    /**
     * Extracts the first comma-separated field of every instance.
     *
     * @param data the data set to read
     * @return  an array containing the first element
     *          of each instance of the input data, in instance order
     */
    private String[] getClusterNames(Instances data) {
        String[] result = new String[data.numInstances()];
        for (int i = 0; i < result.length; i++) {
            result[i] = data.instance(i).toString().split("\\,")[0];
        }
        return result;
    }

    /**
     * Computes the raw GoID of a cluster and its z-score against the mean and
     * standard deviation of {@code count} random raw GoIDs.
     *
     * @param clusterNames   names of the cluster members
     * @param lookupFile     passed through to {@code RawGoID}
     * @param backgroundFile passed through to {@code RawGoID}
     * @param count          number of random samples, as a decimal string
     * @return a two-element array: {@code [0]} raw GoID, {@code [1]} z-score
     */
    private double[] getGoID(String[] clusterNames, String lookupFile, String backgroundFile, String count) {
        RawGoID myRawGoID = new RawGoID(clusterNames, lookupFile, backgroundFile);
        double clusterGoid = myRawGoID.getRawGoID();

        double[] randomGoid = sampleRandomGoIDs(myRawGoID, count);

        double randomAve = Stats.getMean(randomGoid);
        double randomStd = Stats.getStdDev(randomGoid);
        double zScore = Stats.getZscore(randomAve, randomStd, clusterGoid);
        return new double[] { clusterGoid, zScore };
    }

    /**
     * Computes the raw GoID of a cluster and its z-score against the mean and
     * standard deviation that {@code Stats} derives from the original cluster size.
     *
     * @param clusterNames   names of the cluster members
     * @param lookupFile     passed through to {@code RawGoID}
     * @param backgroundFile passed through to {@code RawGoID}
     * @param count          number of random samples, as a decimal string
     * @return a two-element array: {@code [0]} raw GoID, {@code [1]} z-score
     */
    private double[] getGoIDFromFunc(String[] clusterNames, String lookupFile, String backgroundFile, String count) {
        RawGoID myRawGoID = new RawGoID(clusterNames, lookupFile, backgroundFile);
        double clusterGoid = myRawGoID.getRawGoID();

        // The samples are not used for the statistics below. The draw is kept
        // because getRandomRawGoID() may change RawGoID's state -- TODO confirm
        // and drop the call if it has no side effects.
        sampleRandomGoIDs(myRawGoID, count);

        double randomAve = Stats.getMeanFromFunc(myRawGoID.getOriClusterSize());
        double randomStd = Stats.getStdDevFromFunc(myRawGoID.getOriClusterSize());
        double zScore = Stats.getZscore(randomAve, randomStd, clusterGoid);
        return new double[] { clusterGoid, zScore };
    }

    /**
     * Draws {@code count} random raw GoIDs from the given calculator.
     */
    private double[] sampleRandomGoIDs(RawGoID rawGoID, String count) {
        int repeats = Integer.parseInt(count);
        double[] samples = new double[repeats];
        for (int i = 0; i < repeats; i++) {
            samples[i] = rawGoID.getRandomRawGoID();
        }
        return samples;
    }

    /**
     * Closes a resource if it was opened, printing (not propagating) any
     * failure so that cleanup in a finally block never masks the real error.
     */
    private static void closeAndReport(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}//end of class


/**
 * One node of the cluster tree: a named data set together with a likelihood
 * value and a link to the node it was split from.
 */
final class TreeNode {

    private String name;
    private double likelihood;
    private Instances data;
    private TreeNode parent;

    /**
     * @param strName		name of node
     * @param likelihood	likelihood of the data
     * @param data			data set
     * @param parent		the parent node, or {@code null} when there is none
     */
    public TreeNode(String strName, double likelihood, Instances data, TreeNode parent) {
        this.name = strName;
        this.likelihood = likelihood;
        this.data = data;
        this.parent = parent;
    }

    // ----- name -----

    /**
     * @return the name of this node
     */
    public String getStrName() {
        return this.name;
    }

    /**
     * @param strName the new name of this node
     */
    public void setStrName(String strName) {
        this.name = strName;
    }

    // ----- likelihood -----

    /**
     * @return the likelihood stored for this node
     */
    public double getDLikelihood() {
        return this.likelihood;
    }

    /**
     * @param likelihood the likelihood to store for this node
     */
    public void setDLikelihood(double likelihood) {
        this.likelihood = likelihood;
    }

    // ----- data -----

    /**
     * @return the data set held by this node
     */
    public Instances getData() {
        return this.data;
    }

    /**
     * @param data the data set this node should hold
     */
    public void setData(Instances data) {
        this.data = data;
    }

    // ----- parent -----

    /**
     * @return the parent node, or {@code null} if none was set
     */
    public TreeNode getParent() {
        return this.parent;
    }

    /**
     * @param parent the node to record as this node's parent
     */
    public void setParent(TreeNode parent) {
        this.parent = parent;
    }
}