Squashed initial commit
102
qhtcp-workflow/apps/java/weka-clustering/src/ExecMain.java
Executable file
@@ -0,0 +1,102 @@
/*
 * This is the main class to execute the computation and get the GOID.
 */

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.Vector;
import weka.core.Instances;

/**
 *
 * @author DTian
 */
public class ExecMain {

    public static void main(String[] args) {

        // Get clusters
        try {
            // Check the input arguments
            checkParameters(args.length);
            GetClusters myGetClusters = new GetClusters();

            // Read the input file (arff or csv)
            Instances oriData = myGetClusters.input(args[0]);

            // Build the output file names
            String outputWholeCluster = args[0] + "-WholeTree.txt";
            String outputFinalTable = args[0] + "-finalTable.csv";
            String outputSummary = args[0] + "-summary.csv";
            boolean fromFunction = args[4].trim().compareToIgnoreCase("true") == 0;

            // Create the root cluster name
            int round = 1;    // tree level
            int position = 0; // node position within the same level
            String rootName = (round - 1) + "-" + position + "-" + 0;

            myGetClusters.printRootName(outputWholeCluster, rootName);

            // Create vectors for the final table and the summary
            Vector vecFinalTable = new Vector();
            Vector vecSummary = new Vector();

            // Get the variable names (table header)
            vecFinalTable.addElement(myGetClusters.printTableHead(oriData));

            // Create the root node
            TreeNode root = new TreeNode(rootName, 0.0, oriData, null);
            OutputStreamWriter xmlWriter = new OutputStreamWriter(
                    new FileOutputStream(new File("tree.xml"), true));
            xmlWriter.write(" <tree>\n <declarations> \n <attributeDecl name=\"name\" type=\"String\"/>\n </declarations>\n");
            xmlWriter.write(" <branch>\n <attribute name=\"name\" value=\"" + "root" + "\"/>\n");
            xmlWriter.flush();

            // Recursively cluster the data
            myGetClusters.clustering(
                    root,
                    round,
                    "" + position,
                    vecFinalTable,
                    vecSummary,
                    outputWholeCluster,
                    xmlWriter,
                    args[1],
                    args[2],
                    args[3],
                    fromFunction
            );

            xmlWriter.write(" </branch>\n");
            xmlWriter.write("</tree>\n");
            xmlWriter.close();

            // Output the final results
            myGetClusters.printVector(vecFinalTable, outputFinalTable);
            myGetClusters.printVector(vecSummary, outputSummary);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Check the number of arguments:
     * java ExecMain arg1 arg2 ...
     *
     * @param length the number of arguments;
     *               in this program, length should be 5
     */
    private static void checkParameters(int length) {
        if (length != 5) {
            // The 5 parameters are: 1, file to cluster; 2, lookup table file;
            // 3, background file; 4, repeat count (an integer); 5, GoIDfromFunction (boolean)
            System.out.println("Usage: java ExecMain clusterFileName GoMatrixFilename backGroundFilename repeatTime true|false");
            System.exit(1);
        }
    }
}
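
A hypothetical invocation matching the usage string above (the file names and classpath are placeholders, not files from this repository):

    java -cp .:weka.jar ExecMain scores.csv go-matrix.txt background-orfs.txt 100 true

As the code is written, "true" for the fifth argument routes GetClusters.clustering through getGoID (z-score against sampled random clusters), while "false" routes it through getGoIDFromFunc (z-score against the fitted-function mean and standard deviation).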
532
qhtcp-workflow/apps/java/weka-clustering/src/GetClusters.java
Executable file
@@ -0,0 +1,532 @@
/**
 * This program takes an input file (either in arff format or csv format) and
 * outputs 3 files: one is the tree structure, another is the final table with
 * all the information, and the last one is the summary information.
 */
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Vector;

import weka.clusterers.ClusterEvaluation;
import weka.clusterers.EM;
import weka.core.Instances;

public class GetClusters {

    public GetClusters() {
    }

    /**
     * @param root the tree node we need to cluster
     * @param generation the depth of the tree
     * @param position the breadth of the tree
     * @param vecFinalTable contains the final table
     */
    public int clustering(TreeNode root, int generation,
            String position, Vector vecFinalTable,
            Vector vecSummary, String outputFilename, OutputStreamWriter xmlWriter,
            String lookupFile, String backgroundFile, String count, boolean fromFunction) {
        int result = 0;
        try {
            FileOutputStream stream;   // provides file access
            OutputStreamWriter writer; // writes to the file
            stream = new FileOutputStream(new File(outputFilename), true);
            writer = new OutputStreamWriter(stream);

            // ***** 1 create a copy of the original data *****
            Instances oriData = root.getData();
            Instances data = new Instances(oriData);

            // ***** 2 remove the string attribute (orf_name) *****
            data.deleteStringAttributes();

            // ***** 3 clustering *****
            EM clusterer = new EM();        // new instance of clusterer
            clusterer.buildClusterer(data); // build the clusterer

            // Evaluate the clusterer
            ClusterEvaluation eval = new ClusterEvaluation();
            eval.setClusterer(clusterer); // the clusterer to evaluate
            eval.evaluateClusterer(data); // data to evaluate the clusterer on

            // Get the rawGoID and zScore for this node's cluster
            String[] clusterNames = getClusterNames(oriData);
            double[] goID = null;
            if (fromFunction) {
                goID = this.getGoID(clusterNames, lookupFile, backgroundFile, count);
            } else {
                goID = this.getGoIDFromFunc(clusterNames, lookupFile, backgroundFile, count);
            }

            double logLikelihood = eval.getLogLikelihood();
            writer.write("logLikelihood is: " + logLikelihood + "\n");
            writer.write("GoID is: " + goID[0] + "\n");
            writer.write("zScore is: " + goID[1] + "\n\n");
            writer.flush();

            // ***** 4 get the sub clusters *****
            int numberOfSubCluster = eval.getNumClusters();
            if (numberOfSubCluster > 1) { // not an end node

                // Create an Instances array to store the sub clusters
                Instances[] subData = new Instances[numberOfSubCluster];
                TreeNode[] subNode = new TreeNode[numberOfSubCluster];
                for (int i = 0; i < numberOfSubCluster; i++) {
                    subData[i] = new Instances(oriData);
                    subData[i].delete(); // keep only the data head (attributes part)
                }

                // Assign each gene to its cluster
                double[] dArray = eval.getClusterAssignments();
                for (int i = 0; i < dArray.length; i++) {
                    int clusterNumber = (int) dArray[i];
                    for (int j = 0; j < subData.length; j++) {
                        if (j == clusterNumber) {
                            subData[j].add(oriData.instance(i));
                        }
                    } // end of inner j loop
                } // end of outer i loop

                // ***** 5 recursive call *****
                String uniName = "";
                uniName += generation + "-" + position;
                generation++;
                for (int i = 0; i < numberOfSubCluster; i++) {
                    String name = uniName + "-" + i;
                    writer.write("\n******************************\n");
                    writer.write("cluster name: " + name + "\n");
                    writer.flush();
                    xmlWriter.write(" <branch>\n <attribute name=\"name\" value=\"" + name + "\"/>\n");
                    xmlWriter.flush();

                    subNode[i] = new TreeNode(name, eval.getLogLikelihood(),
                            subData[i], root);
                    result += clustering(subNode[i], generation,
                            position + "." + i, vecFinalTable, vecSummary, outputFilename,
                            xmlWriter, lookupFile, backgroundFile, count, fromFunction);
                    xmlWriter.write(" </branch>\n");
                    xmlWriter.flush();
                } // end of for loop
            } else { // leaf node
                result = 1;
                int temp = 1;
                if (!vecSummary.isEmpty()) {
                    String strT = (vecSummary.lastElement().toString()).split(",")[1];
                    temp = Integer.parseInt(strT.trim()) + 1;
                }
                writer.write("leaf node\n");
                writer.flush();

                for (int i = 0; i < root.getData().numInstances(); i++) {
                    String strTemp = eval.getLogLikelihood() + "," + root.getData().instance(i) + "," + getAncestor(root, false) + "," + temp;
                    writer.write(strTemp + "\n");
                    writer.flush();
                    xmlWriter.write("<leaf>\n <attribute name=\"name\" value=\"" + root.getData().instance(i).stringValue(0) + "\"/>\n</leaf>\n");
                    xmlWriter.flush();
                    vecFinalTable.addElement(strTemp);
                }

                vecSummary.addElement(getAncestor(root, false).toString() + "," + temp + "," + root.getData().numInstances() + "," + logLikelihood);
                writer.write("******************************\n");
                writer.flush();
                generation--;
            } // end of else

            writer.close();
            stream.close();

        } catch (Exception e) {
            e.printStackTrace();
        }
        return result;
    } // end of method "clustering"
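
The method above applies Weka's EM clusterer recursively: each call splits a node's instances into sub-clusters, and the recursion bottoms out when EM reports a single cluster. A minimal, self-contained sketch of that core pattern, assuming some numeric data.arff on disk (the file name is illustrative):

    import java.io.BufferedReader;
    import java.io.FileReader;
    import weka.clusterers.ClusterEvaluation;
    import weka.clusterers.EM;
    import weka.core.Instances;

    public class EMSketch {
        public static void main(String[] args) throws Exception {
            // Load instances; EM picks the number of clusters by
            // cross-validation when it is not set explicitly.
            Instances data = new Instances(new BufferedReader(new FileReader("data.arff")));
            EM em = new EM();
            em.buildClusterer(data);
            ClusterEvaluation eval = new ClusterEvaluation();
            eval.setClusterer(em);
            eval.evaluateClusterer(data);
            System.out.println("clusters found: " + eval.getNumClusters());
            // getClusterAssignments() maps instance index -> cluster number,
            // which is how clustering() above partitions oriData into subData[]
            double[] assignments = eval.getClusterAssignments();
            for (int i = 0; i < assignments.length; i++) {
                System.out.println("instance " + i + " -> cluster " + (int) assignments[i]);
            }
        }
    }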
    /**
     * Output the root cluster name to a file.
     * @param fileName the output file name
     * @param rootName the root cluster name
     */
    public void printRootName(String fileName, String rootName) {
        try {
            FileOutputStream stream;   // provides file access
            OutputStreamWriter writer; // writes to the file
            stream = new FileOutputStream(new File(fileName), true);
            writer = new OutputStreamWriter(stream);
            writer.write("root cluster is:" + rootName + "\n");
            writer.flush();
            writer.close();
            stream.close();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Print the attribute part of the data as a CSV-formatted table header.
     *
     * @param data the data set to print
     */
    public String printTableHead(Instances data) {
        String strResult = "likelihood";
        for (int i = 0; i < data.numAttributes(); i++) {
            String strTemp = "";
            String[] strArr = data.attribute(i).toString().split("\\ ");
            for (int j = 1; j < strArr.length - 1; j++) {
                strTemp += strArr[j];
            }
            strResult += "," + strTemp;
        }

        return strResult + ",cluster origin,cluster ID";
    } // end of method "printTableHead"

    /**
     * Print the vector to a file.
     * @param vec the vector to print
     */
    public void printVector(Vector vec, String outputFilename) {
        try {
            FileOutputStream stream;   // provides file access
            OutputStreamWriter writer; // writes to the file
            stream = new FileOutputStream(new File(outputFilename), false);
            writer = new OutputStreamWriter(stream);

            for (int i = 0; i < vec.size(); i++) {
                writer.write(vec.elementAt(i).toString() + "\n");
            }

            writer.close();
            stream.close();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * @param endNode a leaf node
     * @return a string containing the names of all ancestors of the node
     */
    public String getAncestor(TreeNode endNode, boolean fromLeafNode) {
        String strResult = endNode.getStrName();
        TreeNode tempNode = endNode;
        while (tempNode.getParent() != null) {
            tempNode = tempNode.getParent();
            strResult += "; " + tempNode.getStrName();
        }
        if (fromLeafNode) {
            return strResult;
        } else {
            // Reverse the path so it reads root-first
            String newResult = "";
            String[] history = strResult.split("\\;");
            for (int i = history.length; i > 0; i--) {
                newResult += history[i - 1] + "; ";
            }
            return newResult;
        }
    }
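
For illustration, with node names built as in clustering() above, a leaf "2-0.0-1" whose parent is "1-0-0" under root "0-0-0" gives (up to whitespace):

    getAncestor(leaf, true)  -> "2-0.0-1; 1-0-0; 0-0-0"
    getAncestor(leaf, false) -> "0-0-0; 1-0-0; 2-0.0-1; "

The false branch reverses the path so it reads root-first; note the trailing separator it leaves behind.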
    /**
     * Check the number of arguments:
     * java GetCluster arg1 arg2 ...
     *
     * @param length the number of arguments;
     *               in this program, length should be 1
     */
    public void checkParameters(int length) {
        if (length != 1) {
            System.out.println("Usage: java GetCluster inputFileName");
            System.exit(1);
        }
    }

    /**
     * @param inputFileName the name of the input file
     * @return a Weka Instances object
     */
    public Instances input(String inputFileName) {
        String[] inputName = inputFileName.split("\\.");
        Instances oriData = null;
        try {
            if (inputName[inputName.length - 1].compareToIgnoreCase("csv") == 0) {
                // Convert the csv file to arff, then read it
                readCSV(inputFileName);
                FileReader f = new FileReader(inputFileName + ".arff");
                BufferedReader b = new BufferedReader(f);
                oriData = new Instances(b);
            } else if (inputName[inputName.length - 1].compareToIgnoreCase("arff") == 0) {
                // Read from arff data
                FileReader f = new FileReader(inputFileName);
                BufferedReader b = new BufferedReader(f);
                oriData = new Instances(b);
            } else {
                System.out.println("only .arff or .csv format allowed!");
                System.exit(1);
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return oriData;
    }

    /**
     * Read a csv file and convert it to an arff file.
     * @param inputName the name of the csv file
     */
    public void readCSV(String inputName) {
        try {
            FileReader fr = new FileReader(inputName);
            BufferedReader br = new BufferedReader(fr);
            FileOutputStream stream;   // provides file access
            OutputStreamWriter writer; // writes to the file
            stream = new FileOutputStream(new File(inputName + ".arff"), false);
            writer = new OutputStreamWriter(stream);

            // The first line holds the variable names
            String strLine = br.readLine();
            String[] varNameArray = strLine.split("\\,");

            writer.write("@RELATION dataset" + "\n\n");
            for (int i = 0; i < varNameArray.length; i++) {
                // The first two columns are names (strings), the rest are numeric
                if (i < 2) {
                    writer.write("@ATTRIBUTE" + " " + "\"" + varNameArray[i] + "\"" + " " + "string" + "\n");
                } else {
                    writer.write("@ATTRIBUTE" + " " + "\"" + varNameArray[i] + "\"" + " " + "numeric" + "\n");
                }
            }
            writer.write("\n@DATA\n");
            while ((strLine = br.readLine()) != null) {
                writer.write(strLine + "\n");
            }

            writer.close();
            stream.close();
            fr.close();
            br.close();

        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
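
For illustration, a hypothetical input CSV beginning

    orf,gene,score1,score2
    yal001c,tfc3,0.12,0.98

is rewritten by readCSV above as an arff file whose first two columns become string attributes and the rest numeric:

    @RELATION dataset

    @ATTRIBUTE "orf" string
    @ATTRIBUTE "gene" string
    @ATTRIBUTE "score1" numeric
    @ATTRIBUTE "score2" numeric

    @DATA
    yal001c,tfc3,0.12,0.98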
    /**
     * @param data the input data
     * @return an array containing the first element
     *         of each instance of the input data
     */
    private String[] getClusterNames(Instances data) {
        String[] result = new String[data.numInstances()];
        for (int i = 0; i < result.length; i++) {
            String[] strArray = data.instance(i).toString().split("\\,");
            result[i] = strArray[0];
        }
        return result;
    }

    private double[] getGoID(String[] clusterNames, String lookupFile, String backgroundFile, String count) {
        // Calculate the RawGoID and its z-score against sampled random clusters
        double[] result = new double[2];

        // Initialize local variables
        RawGoID myRawGoID = new RawGoID(clusterNames, lookupFile, backgroundFile);
        double clusterGoid = myRawGoID.getRawGoID();
        double randomAve = 0.0;
        double randomStd = 0.0;
        double zScore = 0.0;

        // Get 'repeat time' random rawGoIDs
        double[] randomGoid = new double[Integer.parseInt(count)];
        for (int i = 0; i < Integer.parseInt(count); i++) {
            randomGoid[i] = myRawGoID.getRandomRawGoID();
        }

        // The null mean and standard deviation come from the random samples
        randomAve = Stats.getMean(randomGoid);
        randomStd = Stats.getStdDev(randomGoid);
        zScore = Stats.getZscore(randomAve, randomStd, clusterGoid);
        result[0] = clusterGoid;
        result[1] = zScore;
        return result;
    }

    private double[] getGoIDFromFunc(String[] clusterNames, String lookupFile, String backgroundFile, String count) {
        // Calculate the RawGoID and its z-score against a fitted null model
        double[] result = new double[2];

        // Initialize local variables
        RawGoID myRawGoID = new RawGoID(clusterNames, lookupFile, backgroundFile);
        double clusterGoid = myRawGoID.getRawGoID();
        double randomAve = 0.0;
        double randomStd = 0.0;
        double zScore = 0.0;

        // Get 'repeat time' random rawGoIDs
        double[] randomGoid = new double[Integer.parseInt(count)];
        for (int i = 0; i < Integer.parseInt(count); i++) {
            randomGoid[i] = myRawGoID.getRandomRawGoID();
        }

        // The null mean and standard deviation come from functions fitted to
        // the cluster size (the sampled values are not used in this variant)
        randomAve = Stats.getMeanFromFunc(myRawGoID.getOriClusterSize());
        randomStd = Stats.getStdDevFromFunc(myRawGoID.getOriClusterSize());
        zScore = Stats.getZscore(randomAve, randomStd, clusterGoid);
        result[0] = clusterGoid;
        result[1] = zScore;
        return result;
    }
} // end of class

final class TreeNode {

    private String strName;
    private double dLikelihood;
    private Instances data;
    private TreeNode parent;

    /**
     * @param strName the name of the node
     * @param likelihood the likelihood of the data
     * @param data the data set
     * @param parent the parent node
     */
    public TreeNode(String strName, double likelihood, Instances data, TreeNode parent) {
        this.strName = strName;
        dLikelihood = likelihood;
        this.data = data;
        this.parent = parent;
    }

    /**
     * @return the data
     */
    public Instances getData() {
        return data;
    }

    /**
     * @param data the data to set
     */
    public void setData(Instances data) {
        this.data = data;
    }

    /**
     * @return the dLikelihood
     */
    public double getDLikelihood() {
        return dLikelihood;
    }

    /**
     * @param likelihood the dLikelihood to set
     */
    public void setDLikelihood(double likelihood) {
        dLikelihood = likelihood;
    }

    /**
     * @return the parent
     */
    public TreeNode getParent() {
        return parent;
    }

    /**
     * @param parent the parent to set
     */
    public void setParent(TreeNode parent) {
        this.parent = parent;
    }

    /**
     * @return the strName
     */
    public String getStrName() {
        return strName;
    }

    /**
     * @param strName the strName to set
     */
    public void setStrName(String strName) {
        this.strName = strName;
    }
}
165
qhtcp-workflow/apps/java/weka-clustering/src/Information.java
Executable file
@@ -0,0 +1,165 @@
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.util.HashMap;
import java.util.Iterator;

/**
 *
 * @author DTian
 */
public class Information {

    /**
     * @param data the array of symbols
     * @return the entropy
     */
    public static double entropy(String[] data) {
        double entropy = 0;

        // Frequency table
        HashMap freqDict = new HashMap();
        int one = 1;

        for (int i = 0; i < data.length; i++) {
            String newkey = data[i];
            if (freqDict.containsKey(newkey)) {
                int val = Integer.parseInt(freqDict.get(newkey).toString());
                freqDict.remove(newkey);
                val = val + 1;
                freqDict.put(newkey, val + "");
            } else {
                freqDict.put(newkey, (one + ""));
            }
        }

        // Probability table
        HashMap probDict = new HashMap();
        Iterator it = freqDict.keySet().iterator();
        String newkey = "";
        while (it.hasNext()) {
            newkey = (String) it.next();
            double value = 0.0;
            value = Double.parseDouble((String) freqDict.get(newkey)) / data.length;
            probDict.put(newkey, value + "");
        }

        // Calculate the entropy
        it = probDict.keySet().iterator();
        while (it.hasNext()) {
            newkey = (String) it.next();
            double value = 0.0;
            value = Double.parseDouble((String) probDict.get(newkey));
            entropy = entropy - value * (Math.log(value) / Math.log(2));
        }
        return entropy;
    }
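
The loop above computes the Shannon entropy of the symbol frequencies, H = -sum_k p_k * log2(p_k), where p_k is the fraction of entries equal to symbol k; for example, entropy(new String[]{"a", "a", "b", "b"}) returns 1.0 (one bit).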
    public static double relativeEntropy(String[] data1, String[] data2) {
        double result = 0;

        // Frequency table for data1
        HashMap freqDict1 = new HashMap();
        int one = 1;
        for (int i = 0; i < data1.length; i++) {
            Object key = data1[i];
            if (freqDict1.containsKey(key)) {
                int val = Integer.parseInt(freqDict1.get(key).toString());
                val++;
                freqDict1.put(key, val + "");
            } else {
                freqDict1.put(key, (one + ""));
            }
        }

        // Frequency table for data2
        HashMap freqDict2 = new HashMap();
        for (int i = 0; i < data2.length; i++) {
            Object key = data2[i];
            if (freqDict2.containsKey(key)) {
                int val = Integer.parseInt(freqDict2.get(key).toString());
                val++;
                freqDict2.put(key, val + "");
            } else {
                freqDict2.put(key, (one + ""));
            }
        }

        // Probability tables
        HashMap<Object, Object> probDict1 = new HashMap<Object, Object>();
        HashMap<Object, Object> probDict2 = new HashMap<Object, Object>();
        Iterator it = freqDict1.keySet().iterator();
        while (it.hasNext()) {
            Object newkey = it.next();
            double value = 0;
            value = Double.parseDouble((String) freqDict1.get(newkey)) / data1.length;
            probDict1.put(newkey, value + "");
        }

        it = freqDict2.keySet().iterator();
        while (it.hasNext()) {
            Object newkey = it.next();
            double value = 0;
            value = Double.parseDouble((String) freqDict2.get(newkey)) / data2.length;
            probDict2.put(newkey, value + "");
        }

        // Calculate the relative entropy
        it = probDict1.keySet().iterator();
        while (it.hasNext()) {
            Object newkey = it.next();
            Object value1 = probDict1.get(newkey);
            double dValue1 = Double.parseDouble(probDict1.get(newkey).toString());
            double dValue2 = Double.parseDouble(probDict2.get(newkey).toString());
            if (value1.toString().trim().compareToIgnoreCase("1.0") == 0) {
                result = result + dValue1 * (Math.log(dValue1 / dValue2) / Math.log(2));
            } else if (value1.toString().trim().compareToIgnoreCase("0") == 0) {
                result = result + (1 - dValue1) * (Math.log((1 - dValue1) / (1 - dValue2)) / Math.log(2));
            } else {
                result = result + dValue1 * (Math.log(dValue1 / dValue2) / Math.log(2));
                result = result + (1 - dValue1) * (Math.log((1 - dValue1) / (1 - dValue2)) / Math.log(2));
            }
        }
        return result;
    }
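
The attribute vectors passed in from RawGoID are effectively binary ("0"/"1" annotation flags), so the branch above accumulates, per attribute, the two-outcome Kullback-Leibler divergence of the cluster distribution p against the pool distribution q,

    D = p * log2(p/q) + (1 - p) * log2((1 - p)/(1 - q)),

dropping whichever term is degenerate when p is exactly 1.0 or 0.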
    private static void toFile(String data, String filename) {
        // Output to a file
        try {
            BufferedWriter writer = new BufferedWriter(new FileWriter(filename, true));
            writer.write(data + "\n");
            writer.close();
        } catch (Exception e) {
            System.err.println(e.getStackTrace());
        }
    }

    private static void toFileHM(HashMap data, String filename) {
        // Output a HashMap to a file
        try {
            BufferedWriter writer = new BufferedWriter(new FileWriter(filename, true));
            for (Object key : data.keySet()) {
                writer.write(key.toString() + ":" + data.get(key) + "\n");
            }
            writer.close();
        } catch (Exception e) {
            System.err.println(e.getStackTrace());
        }
    }
}
130
qhtcp-workflow/apps/java/weka-clustering/src/Matrix.java
Executable file
@@ -0,0 +1,130 @@
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;

/**
 *
 * @author DTian
 */
public class Matrix {

    private HashMap matrix; // stores the data
    private int rowSize;    // row size of the matrix
    private int colSize;    // column size of the value array
    private final int lookupTableSize = 9000; // size of the lookup table

    public int getColSize() {
        return colSize;
    }

    public void setColSize(int colSize) {
        this.colSize = colSize;
    }

    public HashMap getMatrix() {
        return matrix;
    }

    public void setMatrix(HashMap matrix) {
        this.matrix = matrix;
    }

    public int getRowSize() {
        return rowSize;
    }

    public void setRowSize(int rowSize) {
        this.rowSize = rowSize;
    }

    public Matrix() {
        rowSize = 0;
        colSize = 0;
        matrix = new HashMap();
    }

    /**
     * Constructor with 1 String parameter: creates a matrix from an input file.
     *
     * @param filename the name of the input file
     */
    public Matrix(String filename) {

        // Initialize variables
        this.setRowSize(0);
        this.setColSize(0);
        matrix = new HashMap(lookupTableSize);

        try {
            FileReader fr = new FileReader(filename);
            BufferedReader br = new BufferedReader(fr);

            // strRow is used to read a line from the file (the first row is skipped)
            String strRow = br.readLine();

            // Read the data rows into the matrix
            while ((strRow = br.readLine()) != null) {

                // Pick the delimiter: comma for a CSV file, whitespace otherwise
                String delimiter = "";
                if (strRow.indexOf(",") >= 0) { // CSV file
                    delimiter = "\\,";
                } else {                        // whitespace-delimited file
                    delimiter = "\\s";
                }

                String[] strArray = strRow.trim().split(delimiter);
                String[] strArrValue = Arrays.copyOfRange(strArray, 1, strArray.length);
                // strArray[0] is the orf name, the rest are values
                matrix.put(strArray[0].trim().toLowerCase(), strArrValue);
                rowSize++;
                colSize = strArrValue.length;
            }

            br.close();
            fr.close();
        } catch (IOException e) {
            // Catch possible io errors from readLine()
            System.out.println("IOException error in 'class Matrix, constructor'");
        }
    }

    /**
     * @param key the specified key
     * @return the string array of the value
     */
    public String[] getSpecifiedValue(Object key) {
        return (String[]) matrix.get(key);
    }

    /**
     * @return the list of orf names
     */
    public ArrayList getOrfNames() {
        ArrayList result = new ArrayList(this.getRowSize());
        Iterator it = matrix.keySet().iterator();
        while (it.hasNext()) {
            result.add(it.next());
        }
        return result;
    }

    public void addValue(Object key, Object value) {
        matrix.put(key, value);
    }
}
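
For illustration, a hypothetical lookup-table line (the first line of the file is skipped as a header)

    YAL001C 0 1 0 1

is stored as matrix.get("yal001c") -> {"0", "1", "0", "1"} with colSize = 4; a comma-separated line is handled the same way.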
375
qhtcp-workflow/apps/java/weka-clustering/src/RawGoID.java
Executable file
@@ -0,0 +1,375 @@
/*
 * The input: 3 files. 1 is the cluster file, 2 is the GO matrix file (lookup table),
 * 3 is the background file (pool).
 */

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Random;

/**
 *
 * @author DTian
 */
public class RawGoID {

    private ArrayList clusterGeneList;  // the input cluster file
    private Matrix poolTable;           // the filtered gene pool list
    private Matrix lookupTable;         // the lookup attribute table
    private int oriClusterSize;         // the original cluster size
    private ArrayList oriPoolOrfsName;  // the complete list of the pool table

    public Matrix getLookupTable() {
        return lookupTable;
    }

    public void setLookupTable(Matrix lookupTable) {
        this.lookupTable = lookupTable;
    }

    public Matrix getPoolTable() {
        return poolTable;
    }

    public void setPoolTable(Matrix poolTable) {
        this.poolTable = poolTable;
    }

    public ArrayList getClusterGeneList() {
        return clusterGeneList;
    }

    public void setClusterGeneList(ArrayList clusterGeneList) {
        this.clusterGeneList = clusterGeneList;
    }

    public RawGoID() {
        clusterGeneList = new ArrayList();
        poolTable = new Matrix();
        lookupTable = new Matrix();
    }

    public void setOriClusterSize(int oriClusterSize) {
        this.oriClusterSize = oriClusterSize;
    }

    public int getOriClusterSize() {
        return oriClusterSize;
    }

    public void setOriPoolOrfsName(ArrayList oriPoolOrfsName) {
        this.oriPoolOrfsName = oriPoolOrfsName;
    }

    public ArrayList getOriPoolOrfsName() {
        return oriPoolOrfsName;
    }

    /**
     * @param clusterFilename the cluster filename
     * @param GoMatrixFilename the GO matrix filename
     * @param backGroundFilename the background filename
     */
    public RawGoID(String clusterFilename, String GoMatrixFilename, String backGroundFilename) {
        try {
            clusterGeneList = new ArrayList(200);

            // Get the small gene list (a cluster)
            BufferedReader br = new BufferedReader(new FileReader(clusterFilename));

            // strRow is used to read a line from the file
            String strRow = "";
            while ((strRow = br.readLine()) != null) {
                clusterGeneList.add(strRow.trim().toLowerCase());
            }
            setOriClusterSize(clusterGeneList.size());

            // Get the matrix (lookup table)
            lookupTable = new Matrix(GoMatrixFilename);

            // Get the big gene list (pool or background file)
            br = new BufferedReader(new FileReader(backGroundFilename));

            ArrayList poolOrfsName = new ArrayList(5000);
            while ((strRow = br.readLine()) != null) {
                poolOrfsName.add(strRow.trim().toLowerCase());
            }
            this.setOriPoolOrfsName(poolOrfsName);
            poolTable = new Matrix();
            for (int i = 0; i < poolOrfsName.size(); i++) {
                Object tempKey = poolOrfsName.get(i);
                if (lookupTable.getMatrix().containsKey(tempKey)) {
                    poolTable.addValue(tempKey, lookupTable.getMatrix().get(tempKey));
                }
            }
            poolTable.setRowSize(poolTable.getMatrix().size());
            poolTable.setColSize(lookupTable.getColSize());
            br.close();

            // Take out any ORF from the cluster gene list that does not exist in the
            // pool table; not necessary if all cluster ORFs come from the pool table
            for (int i = clusterGeneList.size() - 1; i >= 0; i--) {
                if (!poolTable.getMatrix().containsKey(clusterGeneList.get(i))) {
                    clusterGeneList.remove(i);
                }
            }

        } catch (IOException e) {
            // Catch possible io errors from readLine()
            System.out.println("IOException error in 'class RawGoID, constructor'");
        }
    }

    /**
     * @param clusterName the cluster ORF names
     * @param GoMatrixFilename the GO matrix filename
     * @param backGroundFilename the background filename
     */
    public RawGoID(String[] clusterName, String GoMatrixFilename, String backGroundFilename) {
        try {
            clusterGeneList = new ArrayList(clusterName.length);

            // Get the small gene list (a cluster)
            for (String name : clusterName) {
                clusterGeneList.add(name.trim().toLowerCase());
            }

            setOriClusterSize(clusterGeneList.size());

            // Get the matrix (lookup table)
            lookupTable = new Matrix(GoMatrixFilename);

            // Get the big gene list (pool or background file)
            BufferedReader br = new BufferedReader(new FileReader(backGroundFilename));
            ArrayList poolOrfsName = new ArrayList(5000);
            String strRow = "";
            while ((strRow = br.readLine()) != null) {
                poolOrfsName.add(strRow.trim().toLowerCase());
            }
            this.setOriPoolOrfsName(poolOrfsName);
            poolTable = new Matrix();
            for (int i = 0; i < poolOrfsName.size(); i++) {
                Object tempKey = poolOrfsName.get(i);
                if (lookupTable.getMatrix().containsKey(tempKey)) {
                    poolTable.addValue(tempKey, lookupTable.getMatrix().get(tempKey));
                }
            }
            poolTable.setRowSize(poolTable.getMatrix().size());
            poolTable.setColSize(lookupTable.getColSize());
            br.close();

            // Take out any ORF from the cluster gene list that does not exist in the
            // pool table; not necessary if all cluster ORFs come from the pool table
            for (int i = clusterGeneList.size() - 1; i >= 0; i--) {
                if (!poolTable.getMatrix().containsKey(clusterGeneList.get(i))) {
                    clusterGeneList.remove(i);
                }
            }

        } catch (IOException e) {
            // Catch possible io errors from readLine()
            System.out.println("IOException error in 'class RawGoID, constructor'");
        }
    }

    public double getRawGoID() {
        double result = 0.0;
        ArrayList fullMatrix = new ArrayList(this.getPoolTable().getRowSize());
        ArrayList subMatrix = new ArrayList(this.getClusterGeneList().size());

        // Fill the fullMatrix with pool table data
        Iterator it = this.getPoolTable().getMatrix().keySet().iterator();
        while (it.hasNext()) {
            Object key = it.next();
            fullMatrix.add(this.getPoolTable().getMatrix().get(key.toString().toLowerCase()));
        }

        // Fill the subMatrix with lookup table data and cluster information
        for (Object element : this.getClusterGeneList()) {
            if (this.getLookupTable().getMatrix().containsKey(element)) {
                subMatrix.add(this.getLookupTable().getMatrix().get(element.toString().toLowerCase()));
            }
        }

        // Transpose the 2 matrices
        ArrayList attrByOrfFullMatrix = this.transpose(fullMatrix);
        ArrayList attrByOrfSubMatrix = this.transpose(subMatrix);

        // Calculate the raw GoID
        for (int i = 0; i < attrByOrfFullMatrix.size(); i++) {
            // Added by tdh: from the source code, we need not do this step
            int nonZeroCount = 0;
            String[] tempArray = (String[]) attrByOrfFullMatrix.get(i);
            for (int j = 0; j < tempArray.length; j++) {
                if (tempArray[j].trim().compareToIgnoreCase("1") == 0) {
                    nonZeroCount++;
                    break; // Jingyu notes: the break may help the code run faster
                }
            }
            if (nonZeroCount >= 0) {
                result = result + Information.relativeEntropy(
                        ((String[]) attrByOrfSubMatrix.get(i)),
                        (String[]) (attrByOrfFullMatrix.get(i)));
            }
        }
        return result;
    }

    private ArrayList transpose(ArrayList data) {
        ArrayList result = new ArrayList(data.size());
        // Do the transpose here
        int rowSize = data.size();
        int colSize = ((String[]) data.get(0)).length;

        String[][] matrix = new String[colSize][rowSize];
        for (int i = 0; i < rowSize; i++) {
            String[] temp = (String[]) data.get(i);
            for (int j = 0; j < colSize; j++) {
                matrix[j][i] = temp[j];
            }
        }

        // Convert to an ArrayList
        for (int i = 0; i < matrix.length; i++) {
            result.add(matrix[i]);
        }
        return result;
    }

    public double getRandomRawGoID() {
        double result = 0.0;
        this.setClusterGeneList(this.getRandomCluster(this.getOriClusterSize()));
        result = this.getRawGoID();
        if (Double.isNaN(result)) {
            return getRandomRawGoID();
        } else {
            return result;
        }
    }

    private void toFile(HashMap data, String filename) {
        // Output to a file
        try {
            BufferedWriter writer = new BufferedWriter(new FileWriter(filename));
            for (Object key : data.keySet()) {
                writer.write(key.toString() + "\n");
            }
            writer.close();
        } catch (Exception e) {
            System.err.println(e.getStackTrace());
        }
    }

    private static void toFileString(String data, String filename) {
        // Output to a file
        try {
            BufferedWriter writer = new BufferedWriter(new FileWriter(filename, true));
            writer.write(data + "\n");
            writer.close();
        } catch (Exception e) {
            System.err.println(e.getStackTrace());
        }
    }

    private ArrayList getRandomCluster(int clusterSize) {
        ArrayList<String> result = new ArrayList(clusterSize);

        // Jingyu: drawing the random cluster from a lookuptable-filtered pool table
        // may bias the average random raw GOid score by using a smaller pool list,
        // so the sample is drawn from the original ORF pool instead.
        ArrayList localOriPoolTable = this.getOriPoolOrfsName();

        // Get a random cluster with the same size as the cluster file
        for (int i = 0; i < clusterSize; i++) {
            result.add(localOriPoolTable.get(randInt(localOriPoolTable.size())).toString().trim().toLowerCase());
        }
        return result;
    }

    /**
     * @param max the upper bound (exclusive)
     * @return a random integer between 0 and max
     */
    private int randInt(int max) {
        Random r = new Random((int) (System.nanoTime()));
        int random = r.nextInt();
        random = Math.abs(random);
        random = random % max;
        return random;
    }
}
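
A hypothetical end-to-end use of this class (the file names and ORFs are placeholders):

    String[] cluster = {"yal001c", "yal002w", "yal003w"};
    RawGoID goid = new RawGoID(cluster, "go-matrix.txt", "background-orfs.txt");
    double real = goid.getRawGoID();        // GO score of the actual cluster
    double draw = goid.getRandomRawGoID();  // one draw from the random-cluster null

GetClusters.getGoID repeats the second call 'repeatTime' times and turns the real score into a z-score against the sampled mean and standard deviation.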
155
qhtcp-workflow/apps/java/weka-clustering/src/SGD2AttrTable.java
Executable file
@@ -0,0 +1,155 @@
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.HashMap;
import java.util.Iterator;

/*
 * This program starts by creating an intermediate table and then will load the
 * function from Dr. Brett McKinney to create the attribute table.
 */

/**
 *
 * @author DTian
 */
public class SGD2AttrTable {

    public void createIntermediateTable(String inputFile, String outputFile) {
        HashMap geneToGODict = new HashMap();
        try {
            FileReader fr = new FileReader(inputFile);
            BufferedReader br = new BufferedReader(fr);

            // strRow is used to read a line from the file (the first row is skipped)
            String strRow = br.readLine();

            while ((strRow = br.readLine()) != null) {

                // Skip the line if it is a comment line
                if (strRow.trim().charAt(0) != 'S') {
                    continue;
                }
                String[] strArray = strRow.trim().split("\\t");
                String key = toKey(strArray[10].toUpperCase());
                if (key.compareToIgnoreCase("") == 0) {
                    continue;
                }
                String value = toValue(strArray[4]);
                if (geneToGODict.containsKey(key)) {
                    geneToGODict.put(key, geneToGODict.get(key) + "\t" + value);
                } else {
                    geneToGODict.put(key, value);
                }
            }

            br.close();
            fr.close();

            // Write to the output file
            FileOutputStream stream;   // provides file access
            OutputStreamWriter writer; // writes to the file
            stream = new FileOutputStream(new File(outputFile), true);
            writer = new OutputStreamWriter(stream);
            Iterator it = geneToGODict.keySet().iterator();
            while (it.hasNext()) {
                String key = it.next().toString();
                String value = geneToGODict.get(key).toString();
                writer.write(key + "\t" + value + "\n");
            }
            writer.flush();
            writer.close();
            stream.close();
        } catch (IOException e) {
            // Catch possible io errors from readLine()
            System.out.println("IOException error in 'class SGD2AttrTable, method createIntermediateTable'");
        }
    }

    public void createAttrTable(String intermediaFile, String outputFile) {
        // Note: this body is currently identical to createIntermediateTable
        HashMap geneToGODict = new HashMap();
        try {
            FileReader fr = new FileReader(intermediaFile);
            BufferedReader br = new BufferedReader(fr);

            // strRow is used to read a line from the file (the first row is skipped)
            String strRow = br.readLine();

            while ((strRow = br.readLine()) != null) {

                // Skip the line if it is a comment line
                if (strRow.trim().charAt(0) != 'S') {
                    continue;
                }
                String[] strArray = strRow.trim().split("\\t");
                String key = toKey(strArray[10].toUpperCase());
                if (key.compareToIgnoreCase("") == 0) {
                    continue;
                }
                String value = toValue(strArray[4]);
                if (geneToGODict.containsKey(key)) {
                    geneToGODict.put(key, geneToGODict.get(key) + "\t" + value);
                } else {
                    geneToGODict.put(key, value);
                }
            }

            br.close();
            fr.close();

            // Write to the output file
            FileOutputStream stream;   // provides file access
            OutputStreamWriter writer; // writes to the file
            stream = new FileOutputStream(new File(outputFile), true);
            writer = new OutputStreamWriter(stream);
            Iterator it = geneToGODict.keySet().iterator();
            while (it.hasNext()) {
                String key = it.next().toString();
                String value = geneToGODict.get(key).toString();
                writer.write(key + "\t" + value + "\n");
            }
            writer.flush();
            writer.close();
            stream.close();
        } catch (IOException e) {
            // Catch possible io errors from readLine()
            System.out.println("IOException error in 'class SGD2AttrTable, method createAttrTable'");
        }
    }

    /**
     * @param raw the string to strip the "GO:" prefix and leading zeros from
     * @return the string without "GO:" and leading zeros
     */
    private String toValue(String raw) {
        String result = raw.toUpperCase(); // raw should be like: "GO:0005739"
        // Delete "GO:"
        result = result.substring(3);
        // Delete the leading zeros
        while (result.charAt(0) == '0') {
            result = result.substring(1);
        }
        return result;
    }

    private String toKey(String raw) {
        String result = raw.toUpperCase(); // raw may contain several names separated by '|'; keep the first
        // Find the '|'
        int end = result.indexOf('|');
        // Get the substring before it
        if (end < 0) {
            return result;
        } else {
            return result.substring(0, end);
        }
    }
}
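
For illustration: toValue("GO:0005739") strips the prefix and the leading zeros to give "5739", and toKey("YAL001C|TFC3") keeps only the part before the first '|', giving "YAL001C".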
75
qhtcp-workflow/apps/java/weka-clustering/src/Stats.java
Executable file
@@ -0,0 +1,75 @@
/*
 * Java port of the following Python reference implementation:
 *
 * def stats(self, r):
 *     # returns the average, standard deviation, and min of a sequence
 *     tot = sum(r)
 *     ave = tot/len(r)
 *     sdsq = sum([(i-ave)**2 for i in r])
 *     s = list(r)
 *     s.sort()
 *     # median = s[len(s)//2]
 *     return ave, (sdsq/(len(r)-1 or 1))**.5
 *
 * def zscore(self, pop_mean, pop_std, raw_goid):
 *     return (raw_goid - pop_mean)/pop_std
 */

/**
 *
 * @author DTian
 */
public class Stats {

    /**
     * @param data the double array
     * @return the standard deviation of the array
     */
    public static double getStdDev(double[] data) {
        double result = 0.0;
        double ave = getMean(data);
        for (double d : data) {
            result += Math.pow((d - ave), 2);
        }
        if (data.length > 1) {
            return Math.sqrt(result / (data.length - 1));
        } else {
            return Math.sqrt(result / 1);
        }
    }

    /**
     * @param data the double array
     * @return the mean of the double array
     */
    public static double getMean(double[] data) {
        double result = 0.0;
        for (double d : data) {
            result += d;
        }
        return (result / data.length);
    }

    /**
     * @param size the size of the original cluster file
     * @return the population mean, estimated from a function fitted to the cluster size
     */
    public static double getMeanFromFunc(int size) {
        return (-4.8616 + 71.1806 / Math.pow(size, 0.33511));
    }

    /**
     * @param size the size of the original cluster file
     * @return the population standard deviation, estimated from a function fitted to the cluster size
     */
    public static double getStdDevFromFunc(int size) {
        return (-0.04943 + 56.634 / Math.pow(size, 0.89384));
    }

    public static double getZscore(double popMean, double popStd, double rawGoid) {
        return (rawGoid - popMean) / popStd;
    }
}
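
A quick sketch of how these helpers combine (the numbers are made up):

    double[] random = {3.1, 2.7, 3.4, 2.9};
    double mean = Stats.getMean(random);        // 3.025
    double sd = Stats.getStdDev(random);        // sample standard deviation (n - 1)
    double z = Stats.getZscore(mean, sd, 4.2);  // (4.2 - mean) / sd

The *FromFunc variants replace the sampled mean and standard deviation with curves fitted to the cluster size; for example, getMeanFromFunc(100) = -4.8616 + 71.1806 / 100^0.33511, roughly 10.35.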