/**
* This program takes an input file (in either ARFF or CSV format) and
* outputs three files: the first is the tree structure, the second is the
* final table with all information, and the third is the summary information.
*
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Vector;
import weka.clusterers.ClusterEvaluation;
import weka.clusterers.EM;
import weka.core.Instances;
/**
 * Recursively clusters a data set with Weka's EM clusterer and writes the
 * resulting tree, the final per-instance table and a per-leaf summary.
 * The class has no fields; every method works only on its arguments.
 */
public class GetClusters {

    /** Number of leading CSV columns that {@link #readCSV(String)} declares as string attributes. */
    private static final int LEADING_STRING_COLUMNS = 2;

    /** Creates a new instance; there is no state to initialise. */
    public GetClusters() {
    }

    /**
     * Clusters the data of {@code root} and recurses into every sub cluster.
     * <p>
     * The node's data is copied, the string attributes are removed from the copy
     * and EM is run on it. The log likelihood, the GoID value and its z-score are
     * appended to {@code outputFilename}. When EM reports more than one cluster,
     * the node's instances are split by cluster assignment and each part is
     * clustered again as a child node. Otherwise the node is a leaf: one line per
     * instance is added to {@code vecFinalTable} and one line to {@code vecSummary}.
     *
     * @param root the tree node we need to cluster
     * @param generation the depth of {@code root}; used as prefix of the child names
     * @param position the position label of {@code root}; child {@code i} gets
     *        {@code position + "." + i}
     * @param vecFinalTable receives one CSV line per instance of every leaf
     * @param vecSummary receives one CSV line per leaf; its last element is read
     *        to derive the next cluster ID
     * @param outputFilename file the textual report is appended to
     * @param xmlWriter receives the separator text written around each child and
     *        for each leaf row; it is flushed here but never closed
     * @param lookupFile passed through to {@code RawGoID}
     * @param backgroundFile passed through to {@code RawGoID}
     * @param count number of random GoID samples, as a decimal integer string
     * @param fromFunction selects which GoID calculation is used (see note in body)
     * @return the number of leaf nodes reached from {@code root} (1 when
     *         {@code root} itself is a leaf); when an exception occurs it is
     *         printed and the count gathered so far is returned
     */
    public int clustering(TreeNode root, int generation,
            String position, Vector vecFinalTable,
            Vector vecSummary, String outputFilename, OutputStreamWriter xmlWriter,
            String lookupFile, String backgroundFile, String count, boolean fromFunction) {
        int result = 0;
        FileOutputStream stream = null;
        OutputStreamWriter writer = null;
        try {
            // Append mode: every recursive call re-opens the same file and adds to it.
            stream = new FileOutputStream(new File(outputFilename), true);
            writer = new OutputStreamWriter(stream);

            // 1. Work on a copy so the node keeps its original attributes.
            Instances oriData = root.getData();
            Instances data = new Instances(oriData);
            // 2. Remove the string attributes before clustering.
            data.deleteStringAttributes();

            // 3. Build the clusterer and evaluate it on the same data.
            EM clusterer = new EM();
            clusterer.buildClusterer(data);
            ClusterEvaluation eval = new ClusterEvaluation();
            eval.setClusterer(clusterer);
            eval.evaluateClusterer(data);

            String[] clusterNames = getClusterNames(oriData);
            // NOTE(review): fromFunction == true selects getGoID and false selects
            // getGoIDFromFunc, which reads as the opposite of the method names.
            // Kept exactly as found; confirm the intent with the callers.
            double[] goID = fromFunction
                    ? getGoID(clusterNames, lookupFile, backgroundFile, count)
                    : getGoIDFromFunc(clusterNames, lookupFile, backgroundFile, count);
            double logLikelihood = eval.getLogLikelihood();
            writer.write("logLikelihood is: " + logLikelihood + "\n");
            writer.write("GoID is: " + goID[0] + "\n");
            writer.write("zScore is: " + goID[1] + "\n\n");
            writer.flush();

            // 4. Split into sub clusters, or record a leaf.
            int numberOfSubCluster = eval.getNumClusters();
            if (numberOfSubCluster > 1) {
                Instances[] subData = partitionByAssignment(
                        oriData, eval.getClusterAssignments(), numberOfSubCluster);

                // 5. Recurse into every sub cluster.
                String uniName = generation + "-" + position;
                int childGeneration = generation + 1;
                for (int i = 0; i < numberOfSubCluster; i++) {
                    String name = uniName + "-" + i;
                    writer.write("\n******************************\n");
                    writer.write("cluster name: " + name + "\n");
                    // Flush before recursing so the child's appended output follows this text.
                    writer.flush();
                    xmlWriter.write(" \n \n");
                    xmlWriter.flush();
                    TreeNode child = new TreeNode(name, logLikelihood, subData[i], root);
                    result += clustering(child, childGeneration,
                            position + "." + i, vecFinalTable, vecSummary, outputFilename,
                            xmlWriter, lookupFile, backgroundFile, count, fromFunction);
                    xmlWriter.write(" \n");
                    xmlWriter.flush();
                }
            } else {
                // Leaf node: it counts as one cluster.
                result = 1;
                int clusterId = nextClusterId(vecSummary);
                writer.write("leaf node\n");
                writer.flush();
                String ancestors = getAncestor(root, false);
                int instanceCount = oriData.numInstances();
                for (int i = 0; i < instanceCount; i++) {
                    String row = logLikelihood + "," + oriData.instance(i) + ","
                            + ancestors + "," + clusterId;
                    writer.write(row + "\n");
                    writer.flush();
                    xmlWriter.write("\n \n\n");
                    xmlWriter.flush();
                    vecFinalTable.addElement(row);
                }
                vecSummary.addElement(ancestors + "," + clusterId + ","
                        + instanceCount + "," + logLikelihood);
                writer.write("******************************\n");
                writer.flush();
            }
        } catch (Exception e) {
            // Best effort, as before: report the problem and return the partial count.
            e.printStackTrace();
        } finally {
            // Always release the file handle, also when clustering or writing failed.
            closeResource(writer);
            closeResource(stream);
        }
        return result;
    }

    /**
     * Splits {@code source} into one data set per cluster.
     *
     * @param source the data whose instances are distributed
     * @param assignments cluster index of each instance, in instance order
     * @param clusterCount number of data sets to create
     * @return {@code clusterCount} data sets sharing the header of {@code source}
     */
    private static Instances[] partitionByAssignment(Instances source, double[] assignments,
            int clusterCount) {
        Instances[] parts = new Instances[clusterCount];
        for (int c = 0; c < clusterCount; c++) {
            // Header only; avoids copying every instance just to delete it again.
            parts[c] = new Instances(source, 0);
        }
        for (int i = 0; i < assignments.length; i++) {
            int cluster = (int) assignments[i];
            // Indices outside [0, clusterCount) are skipped, which is what the
            // former linear search over all clusters did as well.
            if (cluster >= 0 && cluster < clusterCount) {
                parts[cluster].add(source.instance(i));
            }
        }
        return parts;
    }

    /**
     * Derives the ID of the next leaf cluster from the summary lines so far.
     *
     * @param vecSummary summary lines; the second comma-separated field of the
     *        last line is the previous cluster ID
     * @return 1 for the first leaf, otherwise the previous ID plus one
     */
    private static int nextClusterId(Vector vecSummary) {
        if (vecSummary.isEmpty()) {
            return 1;
        }
        String previousId = vecSummary.lastElement().toString().split(",")[1];
        return Integer.parseInt(previousId.trim()) + 1;
    }

    /**
     * Closes a resource if it was opened; a failure to close is printed, not thrown.
     *
     * @param resource the resource to close, may be {@code null}
     */
    private static void closeResource(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Appends the root cluster name to a file.
     *
     * @param fileName output file name
     * @param rootName the root cluster name
     */
    public void printRootName(String fileName, String rootName) {
        FileOutputStream stream = null;
        OutputStreamWriter writer = null;
        try {
            stream = new FileOutputStream(new File(fileName), true);
            writer = new OutputStreamWriter(stream);
            writer.write("root cluster is:" + rootName + "\n");
            writer.flush();
        } catch (IOException e) {
            // Also covers FileNotFoundException, which was handled identically.
            e.printStackTrace();
        } finally {
            closeResource(writer);
            closeResource(stream);
        }
    }

    /**
     * Builds the CSV header line of the final table: {@code likelihood}, one
     * column per attribute of {@code data}, then {@code cluster origin} and
     * {@code cluster ID}.
     * <p>
     * A column name is the attribute's string form with its first and last
     * space-separated tokens dropped and the remaining tokens joined without
     * spaces (presumably this strips the attribute keyword and the type; verify
     * against the Weka version in use).
     *
     * @param data the data set whose attributes name the columns
     * @return the header line, without a trailing line break
     */
    public String printTableHead(Instances data) {
        StringBuilder header = new StringBuilder("likelihood");
        for (int i = 0; i < data.numAttributes(); i++) {
            String[] tokens = data.attribute(i).toString().split("\\ ");
            header.append(',');
            for (int j = 1; j < tokens.length - 1; j++) {
                header.append(tokens[j]);
            }
        }
        return header.append(",cluster origin,cluster ID").toString();
    }

    /**
     * Writes every element of a vector to a file, one element per line. An
     * existing file is overwritten.
     *
     * @param vec the elements to write; each is converted with {@code toString()}
     * @param outputFilename the file to write
     */
    public void printVector(Vector vec, String outputFilename) {
        FileOutputStream stream = null;
        OutputStreamWriter writer = null;
        try {
            stream = new FileOutputStream(new File(outputFilename), false);
            writer = new OutputStreamWriter(stream);
            for (int i = 0; i < vec.size(); i++) {
                writer.write(vec.elementAt(i).toString() + "\n");
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException, which was handled identically.
            e.printStackTrace();
        } finally {
            closeResource(writer);
            closeResource(stream);
        }
    }

    /**
     * Lists the name of a node together with the names of all its ancestors.
     *
     * @param endNode the node to start from
     * @param fromLeafNode {@code true} to list from {@code endNode} up to the
     *        top-most ancestor, joined by {@code "; "}; {@code false} to list in
     *        the opposite order, where every entry is followed by {@code "; "}
     * @return the names in the requested order
     */
    public String getAncestor(TreeNode endNode, boolean fromLeafNode) {
        String chain = endNode.getStrName();
        for (TreeNode node = endNode.getParent(); node != null; node = node.getParent()) {
            chain = chain + "; " + node.getStrName();
        }
        if (fromLeafNode) {
            return chain;
        }
        // Reverse the entries; the blank left in front of each split entry is kept.
        String[] entries = chain.split("\\;");
        StringBuilder reversed = new StringBuilder();
        for (int i = entries.length - 1; i >= 0; i--) {
            reversed.append(entries[i]).append("; ");
        }
        return reversed.toString();
    }

    /**
     * Checks the number of command line arguments and terminates the JVM with
     * status 1 after printing a usage message when it is not exactly one.
     *
     * @param length the number of arguments; in this program it should be 1
     */
    public void checkParameters(int length) {
        if (length == 1) {
            return;
        }
        System.out.println("Usage: java GetCluster inputFileName");
        System.exit(1);
    }

    /**
     * Loads a data set from an ARFF or CSV file. A CSV file is first converted
     * by {@link #readCSV(String)} and the resulting {@code <name>.arff} is loaded.
     * Any other extension prints a message and terminates the JVM with status 1.
     *
     * @param inputFileName the name of the input file
     * @return the loaded data, or {@code null} when the file could not be read
     *         (the exception is printed)
     */
    public Instances input(String inputFileName) {
        String[] inputName = inputFileName.split("\\.");
        // A name consisting only of dots splits into nothing; treat it as unsupported.
        String extension = inputName.length == 0 ? "" : inputName[inputName.length - 1];

        String arffFileName = null;
        if (extension.equalsIgnoreCase("csv")) {
            readCSV(inputFileName);
            arffFileName = inputFileName + ".arff";
        } else if (extension.equalsIgnoreCase("arff")) {
            arffFileName = inputFileName;
        } else {
            System.out.println("only .arff or .csv format allowed!");
            System.exit(1);
        }

        Instances oriData = null;
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(arffFileName));
            oriData = new Instances(reader);
        } catch (IOException e) {
            // Also covers FileNotFoundException, which was handled identically.
            e.printStackTrace();
        } finally {
            // The reader was never closed before.
            closeResource(reader);
        }
        return oriData;
    }

    /**
     * Converts a CSV file into an ARFF file named {@code inputName + ".arff"}.
     * The first CSV line supplies the attribute names; the first two columns are
     * declared as string attributes and all others as numeric. The remaining
     * lines are copied unchanged into the data section.
     * <p>
     * When the CSV file is empty a message is printed to standard error and the
     * ARFF file is left empty.
     *
     * @param inputName the name of the CSV file
     */
    public void readCSV(String inputName) {
        BufferedReader br = null;
        FileOutputStream stream = null;
        OutputStreamWriter writer = null;
        try {
            br = new BufferedReader(new FileReader(inputName));
            stream = new FileOutputStream(new File(inputName + ".arff"), false);
            writer = new OutputStreamWriter(stream);

            String strLine = br.readLine();
            if (strLine == null) {
                // Previously this ended in a NullPointerException with open streams.
                System.err.println("CSV file is empty, no ARFF content written: " + inputName);
                return;
            }
            String[] varNameArray = strLine.split("\\,");
            writer.write("@RELATION dataset\n\n");
            for (int i = 0; i < varNameArray.length; i++) {
                String type = i < LEADING_STRING_COLUMNS ? "string" : "numeric";
                writer.write("@ATTRIBUTE \"" + varNameArray[i] + "\" " + type + "\n");
            }
            writer.write("\n@DATA\n");
            while ((strLine = br.readLine()) != null) {
                writer.write(strLine + "\n");
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException, which was handled identically.
            e.printStackTrace();
        } finally {
            closeResource(writer);
            closeResource(stream);
            closeResource(br);
        }
    }

    /**
     * Extracts the first comma-separated field of every instance.
     *
     * @param data the data set to read
     * @return an array containing the first field of the string form of each
     *         instance of {@code data}, in instance order
     */
    private String[] getClusterNames(Instances data) {
        int total = data.numInstances();
        String[] names = new String[total];
        for (int row = 0; row < total; row++) {
            names[row] = data.instance(row).toString().split("\\,")[0];
        }
        return names;
    }

    /**
     * Calculates the raw GoID of a cluster and its z-score against random
     * samples; mean and standard deviation are taken from the samples.
     *
     * @param clusterNames passed through to {@code RawGoID}
     * @param lookupFile passed through to {@code RawGoID}
     * @param backgroundFile passed through to {@code RawGoID}
     * @param count number of random samples, as a decimal integer string
     * @return {@code [0]} the raw GoID of the cluster, {@code [1]} its z-score
     */
    private double[] getGoID(String[] clusterNames, String lookupFile, String backgroundFile, String count) {
        RawGoID myRawGoID = new RawGoID(clusterNames, lookupFile, backgroundFile);
        double clusterGoid = myRawGoID.getRawGoID();
        double[] randomGoid = drawRandomGoIDs(myRawGoID, count);
        double randomAve = Stats.getMean(randomGoid);
        double randomStd = Stats.getStdDev(randomGoid);
        double zScore = Stats.getZscore(randomAve, randomStd, clusterGoid);
        return new double[] { clusterGoid, zScore };
    }

    /**
     * Calculates the raw GoID of a cluster and its z-score; mean and standard
     * deviation are obtained from {@code Stats} for the original cluster size.
     *
     * @param clusterNames passed through to {@code RawGoID}
     * @param lookupFile passed through to {@code RawGoID}
     * @param backgroundFile passed through to {@code RawGoID}
     * @param count number of random samples, as a decimal integer string
     * @return {@code [0]} the raw GoID of the cluster, {@code [1]} its z-score
     */
    private double[] getGoIDFromFunc(String[] clusterNames, String lookupFile, String backgroundFile, String count) {
        RawGoID myRawGoID = new RawGoID(clusterNames, lookupFile, backgroundFile);
        double clusterGoid = myRawGoID.getRawGoID();
        // NOTE(review): the samples are not used by the calculation below. The
        // calls are kept because getRandomRawGoID may have side effects; confirm
        // whether the sampling can be dropped here.
        drawRandomGoIDs(myRawGoID, count);
        double randomAve = Stats.getMeanFromFunc(myRawGoID.getOriClusterSize());
        double randomStd = Stats.getStdDevFromFunc(myRawGoID.getOriClusterSize());
        double zScore = Stats.getZscore(randomAve, randomStd, clusterGoid);
        return new double[] { clusterGoid, zScore };
    }

    /**
     * Draws the requested number of random raw GoIDs.
     *
     * @param source the object the random raw GoIDs are drawn from
     * @param count number of samples, as a decimal integer string
     * @return the samples in the order they were drawn
     */
    private static double[] drawRandomGoIDs(RawGoID source, String count) {
        // Parse once instead of on every loop iteration.
        int repeats = Integer.parseInt(count);
        double[] samples = new double[repeats];
        for (int i = 0; i < repeats; i++) {
            samples[i] = source.getRandomRawGoID();
        }
        return samples;
    }
}
/**
 * One node of the cluster tree: a name, a likelihood value, the data set held
 * by the node and a link to the parent node.
 */
final class TreeNode {

    /** Name of the node. */
    private String strName;

    /** Likelihood of the data. */
    private double dLikelihood;

    /** Data set held by the node. */
    private Instances data;

    /** Parent node. */
    private TreeNode parent;

    /**
     * Creates a node from its four properties.
     *
     * @param strName name of node
     * @param likelihood likelihood of the data
     * @param data data set
     * @param parent the parent node of the new node
     */
    public TreeNode(String strName, double likelihood, Instances data, TreeNode parent) {
        this.strName = strName;
        this.dLikelihood = likelihood;
        this.data = data;
        this.parent = parent;
    }

    /**
     * @return the name of the node
     */
    public String getStrName() {
        return this.strName;
    }

    /**
     * @param strName the new name of the node
     */
    public void setStrName(String strName) {
        this.strName = strName;
    }

    /**
     * @return the likelihood value
     */
    public double getDLikelihood() {
        return this.dLikelihood;
    }

    /**
     * @param likelihood the new likelihood value
     */
    public void setDLikelihood(double likelihood) {
        this.dLikelihood = likelihood;
    }

    /**
     * @return the data set of the node
     */
    public Instances getData() {
        return this.data;
    }

    /**
     * @param data the new data set of the node
     */
    public void setData(Instances data) {
        this.data = data;
    }

    /**
     * @return the parent node
     */
    public TreeNode getParent() {
        return this.parent;
    }

    /**
     * @param parent the new parent node
     */
    public void setParent(TreeNode parent) {
        this.parent = parent;
    }
}