/** * This program will take an input file(either in arff format or csv format). * outout 3 files: one is the tree structure. another is the final table with * all information, the last one is the summary information * */ import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.OutputStreamWriter; import java.util.Vector; import weka.clusterers.ClusterEvaluation; import weka.clusterers.EM; import weka.core.Instances; public class GetClusters { public GetClusters() { } /** * * @param root the tree node we need to cluster * @param generation the depth of the tree * @param position the breadth of the tree * @param vecFinalTable contain the final table */ public int clustering(TreeNode root, int generation, String position, Vector vecFinalTable, Vector vecSummary, String outputFilename, OutputStreamWriter xmlWriter, String lookupFile, String backgroundFile, String count, boolean fromFunction) { int result = 0; try { FileOutputStream stream;// provides file access OutputStreamWriter writer;// writes to the file stream = new FileOutputStream(new File(outputFilename), true); writer = new OutputStreamWriter(stream); // ***** 1 create a copy of original data ***** Instances oriData = root.getData(); Instances data = new Instances(oriData); // ***** 2 remove attribute: orf_name(string attribute) ***** // data.deleteAttributeAt(0); data.deleteStringAttributes(); // ***** 3 clustering ***** EM clusterer = new EM(); // new instance of clusterer clusterer.buildClusterer(data); // build the clusterer // evaluate cluster ClusterEvaluation eval = new ClusterEvaluation(); eval.setClusterer(clusterer); // the cluster to evaluate eval.evaluateClusterer(data); // data to evaluate the clusterer on //get the rawGoID and zScore for the //to be continued. AAA String[] clusterNames = getClusterNames(oriData); double[] goID = null; if(fromFunction){ goID = this.getGoID(clusterNames, lookupFile, backgroundFile,count); }else{ goID = this.getGoIDFromFunc(clusterNames, lookupFile, backgroundFile,count); } double logLikelihood = eval.getLogLikelihood(); writer.write("logLikelihood is: " + logLikelihood + "\n"); writer.write("GoID is: " + goID[0] + "\n"); writer.write("zScore is: " + goID[1] + "\n\n"); writer.flush(); // ***** 4 get the sub clusters ***** int numberOfSubCluster = eval.getNumClusters(); if (numberOfSubCluster > 1) {// not an end node // create numberOfSubCluster instances array to store sub // clusters Instances[] subData = new Instances[numberOfSubCluster]; TreeNode[] subNode = new TreeNode[numberOfSubCluster]; for (int i = 0; i < numberOfSubCluster; i++) { subData[i] = new Instances(oriData); subData[i].delete();// keep only data head(attributes part) } // //System.out.println("\nlength is: " + data.numInstances()); // //System.out.println("number of clusters: " + // numberOfSubCluster); // //System.out.println(eval.clusterResultsToString()); double[] dArray = eval.getClusterAssignments(); for (int i = 0; i < dArray.length; i++) { int clusterNumber = (int) dArray[i]; // //System.out.println("\ngene " + i + " is in cluster: " // + clusterNumber + ",\tlog likelihood is:" // + eval.getLogLikelihood()); // //System.out.println("***************"); // assign each gene to according cluster for (int j = 0; j < subData.length; j++) { if (j == clusterNumber) { subData[j].add(oriData.instance(i)); } }// end of inner j loop }// end of outter i loop // ***** 5 recursive call ***** String uniName = ""; // for (int i = 0; i <= generation; i++) { // uniName += "0"; // } uniName += generation + "-" + position; generation++; for (int i = 0; i < numberOfSubCluster; i++) { String name = uniName + "-" + i; //System.out.println("\n******************************"); //System.out.println("cluster name: " + name); writer.write("\n******************************\n"); writer.write("cluster name: " + name + "\n"); writer.flush(); xmlWriter.write(" \n \n"); xmlWriter.flush(); subNode[i] = new TreeNode(name, eval.getLogLikelihood(), subData[i], root); result += clustering(subNode[i], generation, position + "." + i, vecFinalTable, vecSummary, outputFilename, xmlWriter,lookupFile,backgroundFile,count,fromFunction); xmlWriter.write(" \n"); xmlWriter.flush(); }// end of for loop } else { //for leaf node //System.out.println("leaf node"); result = 1; int temp = 1; if (!vecSummary.isEmpty()) { String strT = (vecSummary.lastElement().toString()).split(",")[1]; temp = Integer.parseInt(strT.trim()) + 1; } writer.write("leaf node\n"); writer.flush(); for (int i = 0; i < root.getData().numInstances(); i++) { String strTemp = eval.getLogLikelihood() + "," + root.getData().instance(i) + "," + getAncestor(root, false) + "," + temp; //System.out.println( strTemp); writer.write(strTemp + "\n"); writer.flush(); xmlWriter.write("\n \n\n"); xmlWriter.flush(); vecFinalTable.addElement(strTemp); } vecSummary.addElement(getAncestor(root, false).toString() + "," + temp + "," + root.getData().numInstances() + "," + logLikelihood); //System.out.println("******************************\n"); writer.write("******************************\n"); writer.flush(); generation--; }//end of else writer.close(); stream.close(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } return result; }//end of method "clustering" /** * output the root cluster name to file * @param fileName output file name * @param rootName thr root cluster name */ public void printRootName(String fileName, String rootName) { try { FileOutputStream stream;// provides file access OutputStreamWriter writer;// writes to the file stream = new FileOutputStream(new File(fileName), true); writer = new OutputStreamWriter(stream); writer.write("root cluster is:" + rootName + "\n"); writer.flush(); writer.close(); stream.close(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } /** * print out the instance part of the data into a CSV formated table. * * @param data: the printed data set */ public String printTableHead(Instances data) { String strResult = "likelihood"; for (int i = 0; i < data.numAttributes(); i++) { String strTemp = ""; String[] strArr = data.attribute(i).toString().split("\\ "); for (int j = 1; j < strArr.length - 1; j++) { strTemp += strArr[j]; } strResult += "," + strTemp; } return strResult + ",cluster origin,cluster ID"; }//end of method "printTalbe" /** * print the vector * @param vec */ public void printVector(Vector vec, String outputFilename) { //System.out.println("\n***************************"); //System.out.println("*** final result ***"); //System.out.println("***************************"); try { FileOutputStream stream;// provides file access OutputStreamWriter writer;// writes to the file stream = new FileOutputStream(new File(outputFilename), false); writer = new OutputStreamWriter(stream); for (int i = 0; i < vec.size(); i++) { //System.out.println(vec.elementAt(i)); writer.write(vec.elementAt(i).toString() + "\n"); } writer.close(); stream.close(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } //System.out.println("\n***************************"); //System.out.println("*** end of final result ***"); //System.out.println("***************************"); } /** * * @param endNode an leaf node * @return a string contains all the ancestor's name of the node */ public String getAncestor(TreeNode endNode, boolean fromLeafNode) { String strResult = endNode.getStrName(); TreeNode tempNode = endNode; while (tempNode.getParent() != null) { tempNode = tempNode.getParent(); strResult += "; " + tempNode.getStrName(); } if (fromLeafNode) { return strResult; } else { String newResult = ""; String[] history = strResult.split("\\;"); for (int i = history.length; i > 0; i--) { newResult += history[i - 1] + "; "; } return newResult; } } /** * check the number of the arguments: * java GetCluster arg1 arg2 ... * * @param length the length of the arguments * in this program, length should be 1 */ public void checkParameters(int length) { if (length != 1) { System.out.println("Usage: java GetCluster inputFileName"); System.exit(1); } } /** * * @param inputFileName the name of the input file name * @return an Instances of Weka Instances */ public Instances input(String inputFileName) { String[] inputName = inputFileName.split("\\."); Instances oriData = null; try { if (inputName[inputName.length - 1].compareToIgnoreCase("csv") == 0) { // read from csv file readCSV(inputFileName); FileReader f = new FileReader(inputFileName + ".arff"); BufferedReader b = new BufferedReader(f); oriData = new Instances(b); } else if (inputName[inputName.length - 1].compareToIgnoreCase("arff") == 0) { // read from arff data FileReader f = new FileReader(inputFileName); BufferedReader b = new BufferedReader(f); oriData = new Instances(b); } else { System.out.println("only .arff or .csv format allowed!"); System.exit(1); } } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return oriData; } /** * read a csv file and convert to a arff file * @param inputName the name of the csv file */ public void readCSV(String inputName) { try { FileReader fr = new FileReader(inputName); BufferedReader br = new BufferedReader(fr); FileOutputStream stream;// provides file access OutputStreamWriter writer;// writes to the file stream = new FileOutputStream(new File(inputName + ".arff"), false); writer = new OutputStreamWriter(stream); String strLine = br.readLine(); String[] varNameArray = strLine.split("\\,"); writer.write("@RELATION dataset" + "\n\n"); for (int i = 0; i < varNameArray.length; i++) { if (i < 2) { writer.write("@ATTRIBUTE" + " " + "\"" + varNameArray[i] + "\"" + " " + "string" + "\n"); } else { writer.write("@ATTRIBUTE" + " " + "\"" + varNameArray[i] + "\"" + " " + "numeric" + "\n"); } } writer.write("\n@DATA\n"); while ((strLine = br.readLine()) != null) { writer.write(strLine + "\n"); } writer.close(); stream.close(); fr.close(); br.close(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } /** * * @param data * @return an array contains the first element * of each instance of input data */ private String[] getClusterNames(Instances data) { String[] result = new String[data.numInstances()]; for (int i = 0; i < result.length; i++) { String[] strArray = data.instance(i).toString().split("\\,"); result[i] = strArray[0]; } return result; } private double[] getGoID(String[] clusterNames, String lookupFile, String backgroundFile, String count) { //******************************** // part 2, calculate RawGoID //******************************** double[] result = new double[2]; //initialize local variables: RawGoID myRawGoID = new RawGoID(clusterNames, lookupFile, backgroundFile); double clusterGoid = myRawGoID.getRawGoID(); double randomAve = 0.0; double randomStd = 0.0; double zScore = 0.0; // System.out.println("real cluster raw GOid =" + clusterGoid); // get 'repeat time' random rawGoIDs double[] randomGoid = new double[Integer.parseInt(count)]; for (int i = 0; i < Integer.parseInt(count); i++) { randomGoid[i] = myRawGoID.getRandomRawGoID(); // System.out.println("now is in loop :" + (i + 1)); // System.out.println("randomGOid = " + randomGoid[i]); } //calculate randomAve = Stats.getMean(randomGoid); randomStd = Stats.getStdDev(randomGoid); zScore = Stats.getZscore(randomAve, randomStd, clusterGoid); result[0] = clusterGoid; result[1] = zScore; return result; } private double[] getGoIDFromFunc(String[] clusterNames, String lookupFile, String backgroundFile, String count) { //******************************** // part 2, calculate RawGoID //******************************** double[] result = new double[2]; //initialize local variables: RawGoID myRawGoID = new RawGoID(clusterNames, lookupFile, backgroundFile); double clusterGoid = myRawGoID.getRawGoID(); double randomAve = 0.0; double randomStd = 0.0; double zScore = 0.0; // System.out.println("real cluster raw GOid =" + clusterGoid); // get 'repeat time' random rawGoIDs double[] randomGoid = new double[Integer.parseInt(count)]; for (int i = 0; i < Integer.parseInt(count); i++) { randomGoid[i] = myRawGoID.getRandomRawGoID(); // System.out.println("now is in loop :" + (i + 1)); // System.out.println("randomGOid = " + randomGoid[i]); } //calculate randomAve = Stats.getMeanFromFunc(myRawGoID.getOriClusterSize()); randomStd = Stats.getStdDevFromFunc(myRawGoID.getOriClusterSize()); zScore = Stats.getZscore(randomAve, randomStd, clusterGoid); result[0] = clusterGoid; result[1] = zScore; return result; } }//end of class final class TreeNode { private String strName; private double dLikelihood; private Instances data; private TreeNode parent; // TreeNode child; /** * @param strName name of node * @param likelihood likelihood of the data * @param data data set * @param parent point to its parent node * @param child point to its child node */ public TreeNode(String strName, double likelihood, Instances data, TreeNode parent) { this.strName = strName; dLikelihood = likelihood; this.data = data; this.parent = parent; } /** * @return the data */ public Instances getData() { return data; } /** * @param data the data to set */ public void setData(Instances data) { this.data = data; } /** * @return the dLikelihood */ public double getDLikelihood() { return dLikelihood; } /** * @param likelihood the dLikelihood to set */ public void setDLikelihood(double likelihood) { dLikelihood = likelihood; } /** * @return the parent */ public TreeNode getParent() { return parent; } /** * @param parent the parent to set */ public void setParent(TreeNode parent) { this.parent = parent; } /** * @return the strName */ public String getStrName() { return strName; } /** * @param strName the strName to set */ public void setStrName(String strName) { this.strName = strName; } }