/**
 * This program takes an input file (in either ARFF or CSV format) and
 * outputs three files: the first is the tree structure, the second is the
 * final table with all information, and the last is the summary information.
 */
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Vector;

import weka.clusterers.ClusterEvaluation;
import weka.clusterers.EM;
import weka.core.Instances;
- public class GetClusters {
- public GetClusters() {
- }
- /**
- *
- * @param root the tree node we need to cluster
- * @param generation the depth of the tree
- * @param position the breadth of the tree
- * @param vecFinalTable contain the final table
- */
- public int clustering(TreeNode root, int generation,
- String position, Vector vecFinalTable,
- Vector vecSummary, String outputFilename, OutputStreamWriter xmlWriter,
- String lookupFile, String backgroundFile, String count, boolean fromFunction) {
- int result = 0;
- try {
- FileOutputStream stream;// provides file access
- OutputStreamWriter writer;// writes to the file
- stream = new FileOutputStream(new File(outputFilename), true);
- writer = new OutputStreamWriter(stream);
- // ***** 1 create a copy of original data *****
- Instances oriData = root.getData();
- Instances data = new Instances(oriData);
- // ***** 2 remove attribute: orf_name(string attribute) *****
- // data.deleteAttributeAt(0);
- data.deleteStringAttributes();
- // ***** 3 clustering *****
- EM clusterer = new EM(); // new instance of clusterer
- clusterer.buildClusterer(data); // build the clusterer
- // evaluate cluster
- ClusterEvaluation eval = new ClusterEvaluation();
- eval.setClusterer(clusterer); // the cluster to evaluate
- eval.evaluateClusterer(data); // data to evaluate the clusterer on
- //get the rawGoID and zScore for the
- //to be continued. AAA
- String[] clusterNames = getClusterNames(oriData);
- double[] goID = null;
- if(fromFunction){
- goID = this.getGoID(clusterNames, lookupFile, backgroundFile,count);
- }else{
- goID = this.getGoIDFromFunc(clusterNames, lookupFile, backgroundFile,count);
- }
- double logLikelihood = eval.getLogLikelihood();
- writer.write("logLikelihood is: " + logLikelihood + "\n");
- writer.write("GoID is: " + goID[0] + "\n");
- writer.write("zScore is: " + goID[1] + "\n\n");
- writer.flush();
- // ***** 4 get the sub clusters *****
- int numberOfSubCluster = eval.getNumClusters();
- if (numberOfSubCluster > 1) {// not an end node
- // create numberOfSubCluster instances array to store sub
- // clusters
- Instances[] subData = new Instances[numberOfSubCluster];
- TreeNode[] subNode = new TreeNode[numberOfSubCluster];
- for (int i = 0; i < numberOfSubCluster; i++) {
- subData[i] = new Instances(oriData);
- subData[i].delete();// keep only data head(attributes part)
- }
- // //System.out.println("\nlength is: " + data.numInstances());
- // //System.out.println("number of clusters: " +
- // numberOfSubCluster);
- // //System.out.println(eval.clusterResultsToString());
- double[] dArray = eval.getClusterAssignments();
- for (int i = 0; i < dArray.length; i++) {
- int clusterNumber = (int) dArray[i];
- // //System.out.println("\ngene " + i + " is in cluster: "
- // + clusterNumber + ",\tlog likelihood is:"
- // + eval.getLogLikelihood());
- // //System.out.println("***************");
- // assign each gene to according cluster
- for (int j = 0; j < subData.length; j++) {
- if (j == clusterNumber) {
- subData[j].add(oriData.instance(i));
- }
- }// end of inner j loop
- }// end of outter i loop
- // ***** 5 recursive call *****
- String uniName = "";
- // for (int i = 0; i <= generation; i++) {
- // uniName += "0";
- // }
- uniName += generation + "-" + position;
- generation++;
- for (int i = 0; i < numberOfSubCluster; i++) {
- String name = uniName + "-" + i;
- //System.out.println("\n******************************");
- //System.out.println("cluster name: " + name);
- writer.write("\n******************************\n");
- writer.write("cluster name: " + name + "\n");
- writer.flush();
- xmlWriter.write(" <branch>\n <attribute name=\"name\" value=\"" + name + "\"/>\n");
- xmlWriter.flush();
- subNode[i] = new TreeNode(name, eval.getLogLikelihood(),
- subData[i], root);
- result += clustering(subNode[i], generation,
- position + "." + i, vecFinalTable, vecSummary, outputFilename,
- xmlWriter,lookupFile,backgroundFile,count,fromFunction);
- xmlWriter.write(" </branch>\n");
- xmlWriter.flush();
- }// end of for loop
- } else { //for leaf node
- //System.out.println("leaf node");
- result = 1;
- int temp = 1;
- if (!vecSummary.isEmpty()) {
- String strT = (vecSummary.lastElement().toString()).split(",")[1];
- temp = Integer.parseInt(strT.trim()) + 1;
- }
- writer.write("leaf node\n");
- writer.flush();
- for (int i = 0; i < root.getData().numInstances(); i++) {
- String strTemp = eval.getLogLikelihood() + "," + root.getData().instance(i) + "," + getAncestor(root, false) + "," + temp;
- //System.out.println( strTemp);
- writer.write(strTemp + "\n");
- writer.flush();
- xmlWriter.write("<leaf>\n <attribute name=\"name\" value=\"" + root.getData().instance(i).stringValue(0) + "\"/>\n</leaf>\n");
- xmlWriter.flush();
- vecFinalTable.addElement(strTemp);
- }
- vecSummary.addElement(getAncestor(root, false).toString() + "," + temp + "," + root.getData().numInstances() + "," + logLikelihood);
- //System.out.println("******************************\n");
- writer.write("******************************\n");
- writer.flush();
- generation--;
- }//end of else
- writer.close();
- stream.close();
- } catch (Exception e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- return result;
- }//end of method "clustering"
- /**
- * output the root cluster name to file
- * @param fileName output file name
- * @param rootName thr root cluster name
- */
- public void printRootName(String fileName, String rootName) {
- try {
- FileOutputStream stream;// provides file access
- OutputStreamWriter writer;// writes to the file
- stream = new FileOutputStream(new File(fileName), true);
- writer = new OutputStreamWriter(stream);
- writer.write("root cluster is:" + rootName + "\n");
- writer.flush();
- writer.close();
- stream.close();
- } catch (FileNotFoundException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- }
- /**
- * print out the instance part of the data into a CSV formated table.
- *
- * @param data: the printed data set
- */
- public String printTableHead(Instances data) {
- String strResult = "likelihood";
- for (int i = 0; i < data.numAttributes(); i++) {
- String strTemp = "";
- String[] strArr = data.attribute(i).toString().split("\\ ");
- for (int j = 1; j < strArr.length - 1; j++) {
- strTemp += strArr[j];
- }
- strResult += "," + strTemp;
- }
- return strResult + ",cluster origin,cluster ID";
- }//end of method "printTalbe"
- /**
- * print the vector
- * @param vec
- */
- public void printVector(Vector vec, String outputFilename) {
- //System.out.println("\n***************************");
- //System.out.println("*** final result ***");
- //System.out.println("***************************");
- try {
- FileOutputStream stream;// provides file access
- OutputStreamWriter writer;// writes to the file
- stream = new FileOutputStream(new File(outputFilename), false);
- writer = new OutputStreamWriter(stream);
- for (int i = 0; i < vec.size(); i++) {
- //System.out.println(vec.elementAt(i));
- writer.write(vec.elementAt(i).toString() + "\n");
- }
- writer.close();
- stream.close();
- } catch (FileNotFoundException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- //System.out.println("\n***************************");
- //System.out.println("*** end of final result ***");
- //System.out.println("***************************");
- }
- /**
- *
- * @param endNode an leaf node
- * @return a string contains all the ancestor's name of the node
- */
- public String getAncestor(TreeNode endNode, boolean fromLeafNode) {
- String strResult = endNode.getStrName();
- TreeNode tempNode = endNode;
- while (tempNode.getParent() != null) {
- tempNode = tempNode.getParent();
- strResult += "; " + tempNode.getStrName();
- }
- if (fromLeafNode) {
- return strResult;
- } else {
- String newResult = "";
- String[] history = strResult.split("\\;");
- for (int i = history.length; i > 0; i--) {
- newResult += history[i - 1] + "; ";
- }
- return newResult;
- }
- }
- /**
- * check the number of the arguments:
- * java GetCluster arg1 arg2 ...
- *
- * @param length the length of the arguments
- * in this program, length should be 1
- */
- public void checkParameters(int length) {
- if (length != 1) {
- System.out.println("Usage: java GetCluster inputFileName");
- System.exit(1);
- }
- }
- /**
- *
- * @param inputFileName the name of the input file name
- * @return an Instances of Weka Instances
- */
- public Instances input(String inputFileName) {
- String[] inputName = inputFileName.split("\\.");
- Instances oriData = null;
- try {
- if (inputName[inputName.length - 1].compareToIgnoreCase("csv") == 0) {
- // read from csv file
- readCSV(inputFileName);
- FileReader f = new FileReader(inputFileName + ".arff");
- BufferedReader b = new BufferedReader(f);
- oriData = new Instances(b);
- } else if (inputName[inputName.length - 1].compareToIgnoreCase("arff") == 0) {
- // read from arff data
- FileReader f = new FileReader(inputFileName);
- BufferedReader b = new BufferedReader(f);
- oriData = new Instances(b);
- } else {
- System.out.println("only .arff or .csv format allowed!");
- System.exit(1);
- }
- } catch (FileNotFoundException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- return oriData;
- }
- /**
- * read a csv file and convert to a arff file
- * @param inputName the name of the csv file
- */
- public void readCSV(String inputName) {
- try {
- FileReader fr = new FileReader(inputName);
- BufferedReader br = new BufferedReader(fr);
- FileOutputStream stream;// provides file access
- OutputStreamWriter writer;// writes to the file
- stream = new FileOutputStream(new File(inputName + ".arff"), false);
- writer = new OutputStreamWriter(stream);
- String strLine = br.readLine();
- String[] varNameArray = strLine.split("\\,");
- writer.write("@RELATION dataset" + "\n\n");
- for (int i = 0; i < varNameArray.length; i++) {
- if (i < 2) {
- writer.write("@ATTRIBUTE" + " " + "\"" + varNameArray[i] + "\"" + " " + "string" + "\n");
- } else {
- writer.write("@ATTRIBUTE" + " " + "\"" + varNameArray[i] + "\"" + " " + "numeric" + "\n");
- }
- }
- writer.write("\n@DATA\n");
- while ((strLine = br.readLine()) != null) {
- writer.write(strLine + "\n");
- }
- writer.close();
- stream.close();
- fr.close();
- br.close();
- } catch (FileNotFoundException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- }
- /**
- *
- * @param data
- * @return an array contains the first element
- * of each instance of input data
- */
- private String[] getClusterNames(Instances data) {
- String[] result = new String[data.numInstances()];
- for (int i = 0; i < result.length; i++) {
- String[] strArray = data.instance(i).toString().split("\\,");
- result[i] = strArray[0];
- }
- return result;
- }
- private double[] getGoID(String[] clusterNames, String lookupFile, String backgroundFile, String count) {
- //********************************
- // part 2, calculate RawGoID
- //********************************
- double[] result = new double[2];
- //initialize local variables:
- RawGoID myRawGoID = new RawGoID(clusterNames, lookupFile, backgroundFile);
- double clusterGoid = myRawGoID.getRawGoID();
- double randomAve = 0.0;
- double randomStd = 0.0;
- double zScore = 0.0;
- // System.out.println("real cluster raw GOid =" + clusterGoid);
- // get 'repeat time' random rawGoIDs
- double[] randomGoid = new double[Integer.parseInt(count)];
- for (int i = 0; i < Integer.parseInt(count); i++) {
- randomGoid[i] = myRawGoID.getRandomRawGoID();
- // System.out.println("now is in loop :" + (i + 1));
- // System.out.println("randomGOid = " + randomGoid[i]);
- }
- //calculate
- randomAve = Stats.getMean(randomGoid);
- randomStd = Stats.getStdDev(randomGoid);
- zScore = Stats.getZscore(randomAve, randomStd, clusterGoid);
- result[0] = clusterGoid;
- result[1] = zScore;
- return result;
- }
- private double[] getGoIDFromFunc(String[] clusterNames, String lookupFile, String backgroundFile, String count) {
- //********************************
- // part 2, calculate RawGoID
- //********************************
- double[] result = new double[2];
- //initialize local variables:
- RawGoID myRawGoID = new RawGoID(clusterNames, lookupFile, backgroundFile);
- double clusterGoid = myRawGoID.getRawGoID();
- double randomAve = 0.0;
- double randomStd = 0.0;
- double zScore = 0.0;
- // System.out.println("real cluster raw GOid =" + clusterGoid);
- // get 'repeat time' random rawGoIDs
- double[] randomGoid = new double[Integer.parseInt(count)];
- for (int i = 0; i < Integer.parseInt(count); i++) {
- randomGoid[i] = myRawGoID.getRandomRawGoID();
- // System.out.println("now is in loop :" + (i + 1));
- // System.out.println("randomGOid = " + randomGoid[i]);
- }
- //calculate
- randomAve = Stats.getMeanFromFunc(myRawGoID.getOriClusterSize());
- randomStd = Stats.getStdDevFromFunc(myRawGoID.getOriClusterSize());
- zScore = Stats.getZscore(randomAve, randomStd, clusterGoid);
- result[0] = clusterGoid;
- result[1] = zScore;
- return result;
- }
- }//end of class
- final class TreeNode {
- private String strName;
- private double dLikelihood;
- private Instances data;
- private TreeNode parent;
- // TreeNode child;
- /**
- * @param strName name of node
- * @param likelihood likelihood of the data
- * @param data data set
- * @param parent point to its parent node
- * @param child point to its child node
- */
- public TreeNode(String strName, double likelihood, Instances data, TreeNode parent) {
- this.strName = strName;
- dLikelihood = likelihood;
- this.data = data;
- this.parent = parent;
- }
- /**
- * @return the data
- */
- public Instances getData() {
- return data;
- }
- /**
- * @param data the data to set
- */
- public void setData(Instances data) {
- this.data = data;
- }
- /**
- * @return the dLikelihood
- */
- public double getDLikelihood() {
- return dLikelihood;
- }
- /**
- * @param likelihood the dLikelihood to set
- */
- public void setDLikelihood(double likelihood) {
- dLikelihood = likelihood;
- }
- /**
- * @return the parent
- */
- public TreeNode getParent() {
- return parent;
- }
- /**
- * @param parent the parent to set
- */
- public void setParent(TreeNode parent) {
- this.parent = parent;
- }
- /**
- * @return the strName
- */
- public String getStrName() {
- return strName;
- }
- /**
- * @param strName the strName to set
- */
- public void setStrName(String strName) {
- this.strName = strName;
- }
- }
|