Squashed initial commit

2024-09-10 13:47:29 -04:00
commit 8ebb6ad265
6221 changed files with 2512206 additions and 0 deletions

View File: ExecMain.java

@@ -0,0 +1,102 @@
/*
* This is the main class to execute the computation and get the GOID.
*/
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.Vector;
import weka.core.Instances;
/**
*
* @author DTian
*/
public class ExecMain {
public static void main(String[] args) {
// SGD2AttrTable sgd = new SGD2AttrTable();
// Get clusters
try {
// Check the input arguments
checkParameters(args.length);
GetClusters myGetClusters = new GetClusters();
// Load the input file (ARFF or CSV)
Instances oriData = myGetClusters.input(args[0]);
// Get the output file names
String outputWholeCluster = args[0] + "-WholeTree.txt";
String outputFinalTable = args[0] + "-finalTable.csv";
String outputSummary = args[0] + "-summary.csv";
boolean fromFunction = args[4].trim().compareToIgnoreCase("true") == 0;
// Create the root cluster name
int round = 1; //for tree level
int position = 0; // for node position in same level
String rootName = (round-1)+"-"+position+"-"+0;
// System.out.println("root cluster is:" + rootName);
myGetClusters.printRootName(outputWholeCluster,rootName);
// Create vectors for the final table and the summary
Vector vecFinalTable = new Vector();
Vector vecSummary = new Vector();
//get the variable name
vecFinalTable.addElement(myGetClusters.printTableHead(oriData));
// Create the root node
TreeNode root = new TreeNode(rootName,0.0,oriData,null);
OutputStreamWriter xmlWriter = new OutputStreamWriter(
new FileOutputStream(new File("tree.xml"), true));
xmlWriter.write(" <tree>\n <declarations> \n <attributeDecl name=\"name\" type=\"String\"/>\n </declarations>\n");
xmlWriter.write(" <branch>\n <attribute name=\"name\" value=\""+"root" +"\"/>\n");
xmlWriter.flush();
// Recursively cluster the data
myGetClusters.clustering(
root,
round,
""+position,
vecFinalTable,
vecSummary,
outputWholeCluster,
xmlWriter,
args[1],
args[2],
args[3],
fromFunction
);
xmlWriter.write(" </branch>\n");
xmlWriter.write("</tree>\n");
xmlWriter.close();
// Output final result
myGetClusters.printVector(vecFinalTable,outputFinalTable);
myGetClusters.printVector(vecSummary,outputSummary);
} catch (Exception e) {
e.printStackTrace();
System.exit(1);
}
}
/**
* check the number of the arguments:
* java ExecMain arg1 arg2 ...
*
* @param length the number of arguments;
* in this program, length should be 5
*/
private static void checkParameters(int length) {
if(length != 5) {
// there are 5 parameters: 1, file for clustering; 2, lookup table file;
// 3, background file; 4, repeat count (an integer); 5, GoIDfromFunction (boolean)
System.out.println("Usage: java ExecMain clusterFileName GoMatrixFilename backGroundFilename repeatTime true|false");
System.exit(1);
}
}
}
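/*
 * Example invocation (hypothetical file names):
 *
 *   java ExecMain expression.csv goMatrix.txt background.txt 100 true
 *
 * This would produce expression.csv-WholeTree.txt, expression.csv-finalTable.csv,
 * expression.csv-summary.csv, and tree.xml (the WholeTree file and tree.xml are
 * opened in append mode).
 */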

View File: GetClusters.java

@@ -0,0 +1,532 @@
/**
* This program takes an input file (either in ARFF or CSV format) and
* outputs 3 files: one with the tree structure, one with the final table
* containing all information, and one with the summary information.
*
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Vector;
import weka.clusterers.ClusterEvaluation;
import weka.clusterers.EM;
import weka.core.Instances;
public class GetClusters {
public GetClusters() {
}
/**
* Recursively cluster the data at the given node with EM.
*
* @param root the tree node to cluster
* @param generation the depth of the node in the tree
* @param position the position of the node within its level
* @param vecFinalTable accumulates the rows of the final table
* @param vecSummary accumulates the rows of the summary
*/
public int clustering(TreeNode root, int generation,
String position, Vector vecFinalTable,
Vector vecSummary, String outputFilename, OutputStreamWriter xmlWriter,
String lookupFile, String backgroundFile, String count, boolean fromFunction) {
int result = 0;
try {
FileOutputStream stream;// provides file access
OutputStreamWriter writer;// writes to the file
stream = new FileOutputStream(new File(outputFilename), true);
writer = new OutputStreamWriter(stream);
// ***** 1 create a copy of original data *****
Instances oriData = root.getData();
Instances data = new Instances(oriData);
// ***** 2 remove attribute: orf_name(string attribute) *****
// data.deleteAttributeAt(0);
data.deleteStringAttributes();
// ***** 3 clustering *****
EM clusterer = new EM(); // new instance of clusterer
clusterer.buildClusterer(data); // build the clusterer
// evaluate cluster
ClusterEvaluation eval = new ClusterEvaluation();
eval.setClusterer(clusterer); // the cluster to evaluate
eval.evaluateClusterer(data); // data to evaluate the clusterer on
// Get the rawGoID and zScore for this cluster
String[] clusterNames = getClusterNames(oriData);
double[] goID = null;
if(fromFunction){
goID = this.getGoIDFromFunc(clusterNames, lookupFile, backgroundFile,count);
}else{
goID = this.getGoID(clusterNames, lookupFile, backgroundFile,count);
}
double logLikelihood = eval.getLogLikelihood();
writer.write("logLikelihood is: " + logLikelihood + "\n");
writer.write("GoID is: " + goID[0] + "\n");
writer.write("zScore is: " + goID[1] + "\n\n");
writer.flush();
// ***** 4 get the sub clusters *****
int numberOfSubCluster = eval.getNumClusters();
if (numberOfSubCluster > 1) {// not an end node
// create numberOfSubCluster instances array to store sub
// clusters
Instances[] subData = new Instances[numberOfSubCluster];
TreeNode[] subNode = new TreeNode[numberOfSubCluster];
for (int i = 0; i < numberOfSubCluster; i++) {
subData[i] = new Instances(oriData);
subData[i].delete();// keep only data head(attributes part)
}
// //System.out.println("\nlength is: " + data.numInstances());
// //System.out.println("number of clusters: " +
// numberOfSubCluster);
// //System.out.println(eval.clusterResultsToString());
double[] dArray = eval.getClusterAssignments();
for (int i = 0; i < dArray.length; i++) {
int clusterNumber = (int) dArray[i];
// //System.out.println("\ngene " + i + " is in cluster: "
// + clusterNumber + ",\tlog likelihood is:"
// + eval.getLogLikelihood());
// //System.out.println("***************");
// assign each gene to its corresponding cluster
for (int j = 0; j < subData.length; j++) {
if (j == clusterNumber) {
subData[j].add(oriData.instance(i));
}
}// end of inner j loop
}// end of outer i loop
// ***** 5 recursive call *****
String uniName = "";
// for (int i = 0; i <= generation; i++) {
// uniName += "0";
// }
uniName += generation + "-" + position;
generation++;
for (int i = 0; i < numberOfSubCluster; i++) {
String name = uniName + "-" + i;
//System.out.println("\n******************************");
//System.out.println("cluster name: " + name);
writer.write("\n******************************\n");
writer.write("cluster name: " + name + "\n");
writer.flush();
xmlWriter.write(" <branch>\n <attribute name=\"name\" value=\"" + name + "\"/>\n");
xmlWriter.flush();
subNode[i] = new TreeNode(name, eval.getLogLikelihood(),
subData[i], root);
result += clustering(subNode[i], generation,
position + "." + i, vecFinalTable, vecSummary, outputFilename,
xmlWriter,lookupFile,backgroundFile,count,fromFunction);
xmlWriter.write(" </branch>\n");
xmlWriter.flush();
}// end of for loop
} else { //for leaf node
//System.out.println("leaf node");
result = 1;
int temp = 1;
if (!vecSummary.isEmpty()) {
String strT = (vecSummary.lastElement().toString()).split(",")[1];
temp = Integer.parseInt(strT.trim()) + 1;
}
writer.write("leaf node\n");
writer.flush();
for (int i = 0; i < root.getData().numInstances(); i++) {
String strTemp = eval.getLogLikelihood() + "," + root.getData().instance(i) + "," + getAncestor(root, false) + "," + temp;
//System.out.println( strTemp);
writer.write(strTemp + "\n");
writer.flush();
xmlWriter.write("<leaf>\n <attribute name=\"name\" value=\"" + root.getData().instance(i).stringValue(0) + "\"/>\n</leaf>\n");
xmlWriter.flush();
vecFinalTable.addElement(strTemp);
}
vecSummary.addElement(getAncestor(root, false).toString() + "," + temp + "," + root.getData().numInstances() + "," + logLikelihood);
//System.out.println("******************************\n");
writer.write("******************************\n");
writer.flush();
generation--;
}//end of else
writer.close();
stream.close();
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return result;
}//end of method "clustering"
/**
* output the root cluster name to file
* @param fileName output file name
* @param rootName the root cluster name
*/
public void printRootName(String fileName, String rootName) {
try {
FileOutputStream stream;// provides file access
OutputStreamWriter writer;// writes to the file
stream = new FileOutputStream(new File(fileName), true);
writer = new OutputStreamWriter(stream);
writer.write("root cluster is:" + rootName + "\n");
writer.flush();
writer.close();
stream.close();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
/**
* Build the CSV header row for the final table from the attribute names.
*
* @param data the data set whose attribute names are used
*/
public String printTableHead(Instances data) {
String strResult = "likelihood";
for (int i = 0; i < data.numAttributes(); i++) {
String strTemp = "";
String[] strArr = data.attribute(i).toString().split("\\ ");
for (int j = 1; j < strArr.length - 1; j++) {
strTemp += strArr[j];
}
strResult += "," + strTemp;
}
return strResult + ",cluster origin,cluster ID";
}//end of method "printTableHead"
/**
* print the vector to a file
* @param vec the vector to print
* @param outputFilename the output file name
*/
public void printVector(Vector vec, String outputFilename) {
//System.out.println("\n***************************");
//System.out.println("*** final result ***");
//System.out.println("***************************");
try {
FileOutputStream stream;// provides file access
OutputStreamWriter writer;// writes to the file
stream = new FileOutputStream(new File(outputFilename), false);
writer = new OutputStreamWriter(stream);
for (int i = 0; i < vec.size(); i++) {
//System.out.println(vec.elementAt(i));
writer.write(vec.elementAt(i).toString() + "\n");
}
writer.close();
stream.close();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
//System.out.println("\n***************************");
//System.out.println("*** end of final result ***");
//System.out.println("***************************");
}
/**
*
* @param endNode a leaf node
* @return a string containing the names of all ancestors of the node
*/
public String getAncestor(TreeNode endNode, boolean fromLeafNode) {
String strResult = endNode.getStrName();
TreeNode tempNode = endNode;
while (tempNode.getParent() != null) {
tempNode = tempNode.getParent();
strResult += "; " + tempNode.getStrName();
}
if (fromLeafNode) {
return strResult;
} else {
String newResult = "";
String[] history = strResult.split("\\;");
for (int i = history.length; i > 0; i--) {
newResult += history[i - 1] + "; ";
}
return newResult;
}
}
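// Example (illustrative): for a leaf named "1-0-1" whose parent is the root
// "0-0-0", getAncestor(leaf, false) yields the root-first chain
// "0-0-0; 1-0-1; " (up to whitespace), while fromLeafNode == true keeps the
// leaf-first order "1-0-1; 0-0-0".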
/**
* check the number of the arguments:
* java GetCluster arg1 arg2 ...
*
* @param length the number of arguments;
* in this program, length should be 1
*/
public void checkParameters(int length) {
if (length != 1) {
System.out.println("Usage: java GetCluster inputFileName");
System.exit(1);
}
}
/**
*
* @param inputFileName the name of the input file
* @return a Weka Instances object
*/
public Instances input(String inputFileName) {
String[] inputName = inputFileName.split("\\.");
Instances oriData = null;
try {
if (inputName[inputName.length - 1].compareToIgnoreCase("csv") == 0) {
// read from csv file
readCSV(inputFileName);
FileReader f = new FileReader(inputFileName + ".arff");
BufferedReader b = new BufferedReader(f);
oriData = new Instances(b);
} else if (inputName[inputName.length - 1].compareToIgnoreCase("arff") == 0) {
// read from arff data
FileReader f = new FileReader(inputFileName);
BufferedReader b = new BufferedReader(f);
oriData = new Instances(b);
} else {
System.out.println("only .arff or .csv format allowed!");
System.exit(1);
}
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return oriData;
}
/**
* read a CSV file and convert it to an ARFF file
* @param inputName the name of the CSV file
*/
public void readCSV(String inputName) {
try {
FileReader fr = new FileReader(inputName);
BufferedReader br = new BufferedReader(fr);
FileOutputStream stream;// provides file access
OutputStreamWriter writer;// writes to the file
stream = new FileOutputStream(new File(inputName + ".arff"), false);
writer = new OutputStreamWriter(stream);
String strLine = br.readLine();
String[] varNameArray = strLine.split("\\,");
writer.write("@RELATION dataset" + "\n\n");
for (int i = 0; i < varNameArray.length; i++) {
if (i < 2) {
writer.write("@ATTRIBUTE" + " " + "\"" + varNameArray[i] + "\"" + " " + "string" + "\n");
} else {
writer.write("@ATTRIBUTE" + " " + "\"" + varNameArray[i] + "\"" + " " + "numeric" + "\n");
}
}
writer.write("\n@DATA\n");
while ((strLine = br.readLine()) != null) {
writer.write(strLine + "\n");
}
writer.close();
stream.close();
fr.close();
br.close();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
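// Example of the conversion (hypothetical data): a CSV header line
// "orf,gene,v1,v2" becomes
//   @ATTRIBUTE "orf" string
//   @ATTRIBUTE "gene" string
//   @ATTRIBUTE "v1" numeric
//   @ATTRIBUTE "v2" numeric
// followed by "@DATA" and the remaining CSV rows copied verbatim.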
/**
*
* @param data
* @return an array containing the first element
* of each instance of the input data
*/
private String[] getClusterNames(Instances data) {
String[] result = new String[data.numInstances()];
for (int i = 0; i < result.length; i++) {
String[] strArray = data.instance(i).toString().split("\\,");
result[i] = strArray[0];
}
return result;
}
private double[] getGoID(String[] clusterNames, String lookupFile, String backgroundFile, String count) {
//********************************
// part 2, calculate RawGoID
//********************************
double[] result = new double[2];
//initialize local variables:
RawGoID myRawGoID = new RawGoID(clusterNames, lookupFile, backgroundFile);
double clusterGoid = myRawGoID.getRawGoID();
double randomAve = 0.0;
double randomStd = 0.0;
double zScore = 0.0;
// System.out.println("real cluster raw GOid =" + clusterGoid);
// get 'repeat time' random rawGoIDs
double[] randomGoid = new double[Integer.parseInt(count)];
for (int i = 0; i < Integer.parseInt(count); i++) {
randomGoid[i] = myRawGoID.getRandomRawGoID();
// System.out.println("now is in loop :" + (i + 1));
// System.out.println("randomGOid = " + randomGoid[i]);
}
//calculate
randomAve = Stats.getMean(randomGoid);
randomStd = Stats.getStdDev(randomGoid);
zScore = Stats.getZscore(randomAve, randomStd, clusterGoid);
result[0] = clusterGoid;
result[1] = zScore;
return result;
}
private double[] getGoIDFromFunc(String[] clusterNames, String lookupFile, String backgroundFile, String count) {
//********************************
// part 2, calculate RawGoID
//********************************
double[] result = new double[2];
//initialize local variables:
RawGoID myRawGoID = new RawGoID(clusterNames, lookupFile, backgroundFile);
double clusterGoid = myRawGoID.getRawGoID();
double randomAve = 0.0;
double randomStd = 0.0;
double zScore = 0.0;
// System.out.println("real cluster raw GOid =" + clusterGoid);
// get 'repeat time' random rawGoIDs
// (note: these random values are not used below; the mean and standard
// deviation come from the fitted functions instead)
double[] randomGoid = new double[Integer.parseInt(count)];
for (int i = 0; i < Integer.parseInt(count); i++) {
randomGoid[i] = myRawGoID.getRandomRawGoID();
// System.out.println("now is in loop :" + (i + 1));
// System.out.println("randomGOid = " + randomGoid[i]);
}
//calculate
randomAve = Stats.getMeanFromFunc(myRawGoID.getOriClusterSize());
randomStd = Stats.getStdDevFromFunc(myRawGoID.getOriClusterSize());
zScore = Stats.getZscore(randomAve, randomStd, clusterGoid);
result[0] = clusterGoid;
result[1] = zScore;
return result;
}
}//end of class
final class TreeNode {
private String strName;
private double dLikelihood;
private Instances data;
private TreeNode parent;
// TreeNode child;
/**
* @param strName name of node
* @param likelihood likelihood of the data
* @param data data set
* @param parent its parent node
*/
public TreeNode(String strName, double likelihood, Instances data, TreeNode parent) {
this.strName = strName;
dLikelihood = likelihood;
this.data = data;
this.parent = parent;
}
/**
* @return the data
*/
public Instances getData() {
return data;
}
/**
* @param data the data to set
*/
public void setData(Instances data) {
this.data = data;
}
/**
* @return the dLikelihood
*/
public double getDLikelihood() {
return dLikelihood;
}
/**
* @param likelihood the dLikelihood to set
*/
public void setDLikelihood(double likelihood) {
dLikelihood = likelihood;
}
/**
* @return the parent
*/
public TreeNode getParent() {
return parent;
}
/**
* @param parent the parent to set
*/
public void setParent(TreeNode parent) {
this.parent = parent;
}
/**
* @return the strName
*/
public String getStrName() {
return strName;
}
/**
* @param strName the strName to set
*/
public void setStrName(String strName) {
this.strName = strName;
}
}

View File: Information.java

@@ -0,0 +1,165 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.util.HashMap;
import java.util.Iterator;
/**
*
* @author DTian
*/
public class Information {
/**
*
* @param data the array of symbols
* @return the entropy (in bits) of the empirical distribution of data
*/
public static double entropy(String [] data ) {
double entropy = 0;
// Frequency table
HashMap freqDict = new HashMap();
int one = 1;
for(int i=0; i<data.length; i++){
String newkey = data[i];
if (freqDict.containsKey(newkey)) {
int val = Integer.parseInt(freqDict.get(newkey).toString());
freqDict.remove(newkey);
val = val + 1;
freqDict.put(newkey, val + "");
} else {
freqDict.put(newkey, (one + ""));
}
}
// Probability table
HashMap probDict = new HashMap();
Iterator it = freqDict.keySet().iterator();
String newkey = "";
while (it.hasNext()) {
newkey = (String) it.next();
double value = 0.0;
value = Double.parseDouble((String) freqDict.get(newkey)) / data.length;
probDict.put(newkey, value + "");
}
// Calculate entropy
it = probDict.keySet().iterator();
while (it.hasNext()) {
newkey = (String) it.next();
double value = 0.0;
value = Double.parseDouble((String) probDict.get(newkey));
entropy = entropy - value * (Math.log(value) / Math.log(2));
}
return entropy;
}
public static double relativeEntropy(String[] data1, String[] data2) {
double result = 0;
// System.out.println(data1.length);
// Frequency table
HashMap freqDict1 = new HashMap();
int one = 1;
for(int i=0; i<data1.length; i++){
Object key = data1[i];
if(freqDict1.containsKey(key)){
int val = Integer.parseInt( freqDict1.get(key).toString());
//freqDict1.remove(key);
val++;
freqDict1.put(key, val + "");
} else {
freqDict1.put(key, (one + ""));
}
}
// toFileHM(freqDict1, "FreqDict1.txt");
HashMap freqDict2 = new HashMap();
for (int i=0; i<data2.length; i++) {
Object key = data2[i];
if (freqDict2.containsKey(key)) {
int val = Integer.parseInt(freqDict2.get(key).toString());
//freqDict2.remove(key);
val++;
freqDict2.put(key, val + "");
} else {
freqDict2.put(key, (one + ""));
}
}
// Probability table
HashMap<Object, Object> probDict1 = new HashMap<Object, Object>();
HashMap<Object, Object> probDict2 = new HashMap<Object, Object>();
Iterator it = freqDict1.keySet().iterator();
while (it.hasNext()) {
Object newkey = it.next();
double value = 0;
value = Double.parseDouble((String) freqDict1.get(newkey)) / data1.length;
probDict1.put(newkey, value + "");
}
it = freqDict2.keySet().iterator();
while (it.hasNext()) {
Object newkey = it.next();
double value = 0;
value = Double.parseDouble((String) freqDict2.get(newkey)) / data2.length;
probDict2.put(newkey, value + "");
}
// Calculate the relative entropy
it = probDict1.keySet().iterator();
while (it.hasNext()) {
Object newkey = it.next();
Object value1 = probDict1.get(newkey);
//Object value2 = probDict2.get(newkey);
double dValue1 = Double.parseDouble(probDict1.get(newkey).toString());
double dValue2 = Double.parseDouble(probDict2.get(newkey).toString());
if ( value1.toString().trim().compareToIgnoreCase("1.0") == 0) {
// p = 1: only the "present" term contributes (avoids the NaN from 0*log 0 in the other term)
result = result + dValue1 * (Math.log(dValue1/dValue2) / Math.log(2));
} else if (value1.toString().trim().compareToIgnoreCase("0") == 0){
// p = 0: only the "absent" term contributes
result = result + (1-dValue1) * (Math.log((1-dValue1)/(1-dValue2)) / Math.log(2));
} else {
// 0 < p < 1: both terms of the binary relative entropy contribute
result = result + dValue1 * (Math.log(dValue1/dValue2) / Math.log(2));
result = result + (1-dValue1) * (Math.log((1-dValue1)/(1-dValue2)) / Math.log(2));
}
// toFile(result+"", "probDict1.txt");
// toFile(result, "resultsOfresult.txt");//check point by Jingyu
}
//toFile(probDict1.size()+ "*******************", "probDict1.txt");
//System.out.println("relative entropy = " + result);
return result;
}
private static void toFile(String data, String filename) {
// Output to file
try {
BufferedWriter writer = new BufferedWriter(new FileWriter(filename,true));
writer.write(data + "\n");
writer.close();
} catch (Exception e) {
e.printStackTrace();
}
}
private static void toFileHM(HashMap data, String filename) {
// Output to file
try {
BufferedWriter writer = new BufferedWriter(new FileWriter(filename, true));
for (Object key : data.keySet()) {
writer.write(key.toString() +":"+ data.get(key)+"\n");
}
writer.close();
} catch (Exception e) {
e.printStackTrace();
}
}
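// Hypothetical demo, not part of the original pipeline: a uniform two-symbol
// distribution has entropy 1 bit, and the relative entropy of a distribution
// against itself is 0.
public static void main(String[] args) {
String[] sample = {"a", "a", "b", "b"};
System.out.println(entropy(sample)); // expected: 1.0
System.out.println(relativeEntropy(sample, sample)); // expected: 0.0
}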
}

View File: Matrix.java

@@ -0,0 +1,130 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
/**
*
* @author DTian
*/
public class Matrix {
private HashMap matrix; // store data
private int rowSize; // row size of matrix
private int colSize; // column size of value array
private final int lookupTableSize = 9000; //size of lookup table
public int getColSize() {
return colSize;
}
public void setColSize(int colSize) {
this.colSize = colSize;
}
public HashMap getMatrix() {
return matrix;
}
public void setMatrix(HashMap matrix) {
this.matrix = matrix;
}
public int getRowSize() {
return rowSize;
}
public void setRowSize(int rowSize) {
this.rowSize = rowSize;
}
public Matrix() {
rowSize = 0;
colSize = 0;
matrix = new HashMap();
}
/**
* constructor with 1 String parameter
*
* @param filename : the name of the input file
*
* Creates a matrix from an input file.
*
*/
public Matrix(String filename) {
// Initialize variables
this.setRowSize(0);
this.setColSize(0);
matrix = new HashMap(lookupTableSize);
try {
FileReader fr = new FileReader(filename);
BufferedReader br = new BufferedReader(fr);
// strRow is used to read lines from the file (the first row is skipped)
String strRow = br.readLine();
// The while loop reads the data rows from the file into the matrix
while ((strRow = br.readLine()) != null) {
// pick the delimiter: comma for CSV files, whitespace otherwise
String delimiter = "";
if (strRow.indexOf(",") >= 0) { //for CSV file
delimiter = "\\,";
} else { // for whitespace-delimited file
delimiter = "\\s+";
}
String[] strArray = strRow.trim().split(delimiter);
String[] strArrValue = Arrays.copyOfRange(strArray, 1, strArray.length);
// strArray[0] is the orf name, others are value
matrix.put(strArray[0].trim().toLowerCase(), strArrValue);
rowSize++;
colSize = strArrValue.length;
}
br.close();
fr.close();
} catch (IOException e) {
// catch possible io errors from readLine()
System.out.println("IOException error in 'class Matrix, constructor'");
}
}
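// Example (hypothetical data row; the first line of the file is skipped as a
// header): the line "YAL001C,0,1,0" is stored as
// matrix.get("yal001c") -> {"0", "1", "0"}, with colSize = 3.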
/**
*
* @param key the specified key
* @return the string array of the value
*/
public String[] getSpecifiedValue(Object key) {
return (String[]) matrix.get(key);
}
/**
* @return the list of orf names
*/
public ArrayList getOrfNames() {
ArrayList result = new ArrayList(this.getRowSize());
Iterator it = matrix.keySet().iterator();
while (it.hasNext()) {
result.add(it.next());
}
return result;
}
public void addValue(Object key, Object value) {
matrix.put(key, value);
}
}

View File: RawGoID.java

@@ -0,0 +1,375 @@
/*
* The input: 3 files. 1 is the cluster file; 2 is the GO matrix file (lookup table);
* 3 is the background file (pool).
*
*
*/
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Random;
/**
*
* @author DTian
*/
public class RawGoID {
private ArrayList clusterGeneList; // for the input cluster file
private Matrix poolTable; //for the filtered gene pool list
private Matrix lookupTable; // for the lookup attribute table
private int oriClusterSize; //for the original cluster size
private ArrayList oriPoolOrfsName;//for the complete list of pool table
// private String randomFilename;
public Matrix getLookupTable() {
return lookupTable;
}
public void setLookupTable(Matrix lookupTable) {
this.lookupTable = lookupTable;
}
public Matrix getPoolTable() {
return poolTable;
}
public void setPoolTable(Matrix poolTable) {
this.poolTable = poolTable;
}
public ArrayList getClusterGeneList() {
return clusterGeneList;
}
public void setClusterGeneList(ArrayList clusterGeneList) {
this.clusterGeneList = clusterGeneList;
}
public RawGoID() {
clusterGeneList = new ArrayList();
poolTable = new Matrix();
lookupTable = new Matrix();
// randomFilename ="";
}
public void setOriClusterSize(int oriClusterSize) {
this.oriClusterSize = oriClusterSize;
}
public int getOriClusterSize () {
return oriClusterSize;
}
public void setOriPoolOrfsName(ArrayList oriPoolOrfsName) {
this.oriPoolOrfsName = oriPoolOrfsName;
}
public ArrayList getOriPoolOrfsName() {
return oriPoolOrfsName;
}
/**
*
* @param clusterFilename : cluster Filename
* @param GoMatrixFilename : GoMatrix Filename
* @param backGroundFilename : backGround Filename
*/
public RawGoID(String clusterFilename, String GoMatrixFilename, String backGroundFilename) {
try {
clusterGeneList = new ArrayList(200);
// Get the smallGeneList (a cluster )
BufferedReader br = new BufferedReader(new FileReader(clusterFilename));
// strRow is used to read lines from the file
String strRow = "";
while ((strRow = br.readLine()) != null) {
clusterGeneList.add(strRow.trim().toLowerCase());
}
// System.out.println(clusterGeneList.size());
setOriClusterSize(clusterGeneList.size());
// System.out.println("original cluster size =" + clusterGeneList.size());
// Get the matrix (lookup table)
lookupTable = new Matrix(GoMatrixFilename);
// Get the bigGeneList (pool or background file)
br = new BufferedReader(new FileReader(backGroundFilename));
ArrayList poolOrfsName = new ArrayList(5000);
while ((strRow = br.readLine()) != null) {
poolOrfsName.add(strRow.trim().toLowerCase());
}
this.setOriPoolOrfsName(poolOrfsName);
poolTable = new Matrix();
for (int i = 0; i < poolOrfsName.size(); i++) {
Object tempKey = poolOrfsName.get(i);
if (lookupTable.getMatrix().containsKey(tempKey)) {
poolTable.addValue(tempKey, lookupTable.getMatrix().get(tempKey));
}
}
poolTable.setRowSize(poolTable.getMatrix().size());
poolTable.setColSize(lookupTable.getColSize());
br.close();
// This loop takes out any ORF from the cluster gene list that does not exist in the pool table;
// not necessary if all cluster ORFs come from the pool table.
// Iterate backwards so removals do not skip elements.
for (int i=clusterGeneList.size()-1;i>=0;i--){
Object tempKey = clusterGeneList.get(i);
if (!poolTable.getMatrix().containsKey(tempKey)){
clusterGeneList.remove(i);
}
}
// System.out.println("length of real cluster gene List after filtering = " + clusterGeneList.size());
// Check point
// System.out.println(clusterGeneList);
} catch (IOException e) {
// Catch possible io errors from readLine()
System.out.println("IOException error in 'class GetGoID, constructor'");
}
// Checkpoint
// System.out.println("Column size of pooltable is:"+ poolTable.getColSize() +'\t'+ "Row size of pooltable is :"+ poolTable.getRowSize());
// randomFilename = "randomOrfName.txt";
}
/**
*
* @param clusterName : array of gene names in the cluster
* @param GoMatrixFilename : GoMatrix Filename
* @param backGroundFilename : backGround Filename
*/
public RawGoID(String[] clusterName, String GoMatrixFilename, String backGroundFilename) {
try {
clusterGeneList = new ArrayList(clusterName.length);
// Get the smallGeneList (a cluster )
for(String name: clusterName){
clusterGeneList.add(name.trim().toLowerCase());
}
// System.out.println(clusterGeneList.size());
setOriClusterSize(clusterGeneList.size());
// System.out.println("original cluster size =" + clusterGeneList.size());
// Get the matrix (lookup table)
lookupTable = new Matrix(GoMatrixFilename);
// Get the bigGeneList (pool or background file)
BufferedReader br = new BufferedReader(new FileReader(backGroundFilename));
ArrayList poolOrfsName = new ArrayList(5000);
String strRow = "";
while ((strRow = br.readLine()) != null) {
poolOrfsName.add(strRow.trim().toLowerCase());
}
this.setOriPoolOrfsName(poolOrfsName);
poolTable = new Matrix();
for (int i = 0; i < poolOrfsName.size(); i++) {
Object tempKey = poolOrfsName.get(i);
if(lookupTable.getMatrix().containsKey(tempKey)){
poolTable.addValue(tempKey, lookupTable.getMatrix().get(tempKey));
}
}
poolTable.setRowSize(poolTable.getMatrix().size());
poolTable.setColSize(lookupTable.getColSize());
br.close();
// This loop takes out any ORF from the cluster gene list that does not exist in the pool table;
// not necessary if all cluster ORFs come from the pool table.
// Iterate backwards so removals do not skip elements.
for (int i=clusterGeneList.size()-1;i>=0;i--){
Object tempKey = clusterGeneList.get(i);
if (!poolTable.getMatrix().containsKey(tempKey)){
clusterGeneList.remove(i);
}
}
// System.out.println("length of real cluster gene List after filtering = " + clusterGeneList.size());
// Checkpoint
// System.out.println(clusterGeneList);
} catch (IOException e) {
// Catch possible io errors from readLine()
System.out.println("IOException error in 'class GetGoID, constructor'");
}
// Checkpoint
// System.out.println("Column size of pooltable is:"+ poolTable.getColSize() +'\t'+ "Row size of pooltable is :"+ poolTable.getRowSize());
// randomFilename = "randomOrfName.txt";
}
public double getRawGoID() {
double result = 0.0;
ArrayList fullMatrix = new ArrayList(this.getPoolTable().getRowSize());
ArrayList subMatrix = new ArrayList(this.getClusterGeneList().size());
// Fill the fullMatrix with pool table data
Iterator it = this.getPoolTable().getMatrix().keySet().iterator();
while (it.hasNext()) {
Object key = it.next();
fullMatrix.add(this.getPoolTable().getMatrix().get(key.toString().toLowerCase()));
}
// System.out.println("size of fullMatrix is:"+ fullMatrix.size());
// Fill the subMatrix with lookup table data and cluster information
for (Object element : this.getClusterGeneList()) {
if (this.getLookupTable().getMatrix().containsKey(element)) {
subMatrix.add(this.getLookupTable().getMatrix().get(element.toString().toLowerCase()));
}
}
// System.out.println("size of subMatrix is:"+ subMatrix.size());
// Transpose the 2 matrices
ArrayList attrByOrfFullMatrix = this.transpose(fullMatrix);
ArrayList attrByOrfSubMatrix = this.transpose(subMatrix);
// System.out.println("size of transposed fullMatrix is:"+ attrByOrfFullMatrix.size());
// System.out.println("size of transposed subMatrix is:"+ attrByOrfSubMatrix.size());
// Calculate the raw GoID
for (int i = 0; i < attrByOrfFullMatrix.size(); i++) {
// Check whether this attribute has any non-zero entry in the pool.
// (Per tdh: from the source code this step is not strictly needed;
// Jingyu notes the break also speeds up the scan.)
int nonZeroCount = 0;
String[] tempArray = (String[]) attrByOrfFullMatrix.get(i);
for (int j = 0; j < tempArray.length; j++) {
if (tempArray[j].trim().compareToIgnoreCase("1")==0 ) {
nonZeroCount++;
break; // one non-zero entry is enough
}
}
}
// System.out.println("nonzeroCount =" + nonZeroCount);//Jingyu add
if (nonZeroCount >= 0) {
result = result + Information.relativeEntropy(
((String[]) attrByOrfSubMatrix.get(i)),
(String[]) (attrByOrfFullMatrix.get(i)));
}
// System.out.println(Information.relativeEntropy(
// ((String[]) attrByOrfSubMatrix.get(i)), (String[]) (attrByOrfFullMatrix.get(i))));
}
// System.out.println("result =" + result);
return result;
}
private ArrayList transpose(ArrayList data) {
ArrayList result = new ArrayList(data.size());
// Do transpose here
int rowSize = data.size();
int colSize = ((String[]) data.get(0)).length;
String[][] matrix = new String[colSize][rowSize];
for (int i = 0; i < rowSize; i++) {
String[] temp = (String[]) data.get(i);
for (int j = 0; j < colSize; j++) {
// System.out.println("j is : " + j);
matrix[j][i] = temp[j];
}
}
// Convert to ArrayList
for (int i = 0; i < matrix.length; i++) {
result.add(matrix[i]);
}
return result;
}
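// Example: transpose of {{"a","b","c"}, {"d","e","f"}} (2 rows x 3 columns)
// returns {{"a","d"}, {"b","e"}, {"c","f"}} (3 rows x 2 columns).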
public double getRandomRawGoID() {
double result = 0.0;
this.setClusterGeneList(this.getRandomCluster(this.getOriClusterSize()));
result = this.getRawGoID();
if (Double.isNaN(result)) {
return getRandomRawGoID();
} else {
return result;
}
}
private void toFile(HashMap data, String filename) {
// Output to a file
try {
BufferedWriter writer = new BufferedWriter(new FileWriter(filename));
for (Object key : data.keySet()) {
writer.write(key.toString() + "\n");
}
writer.close();
} catch (Exception e) {
e.printStackTrace();
}
}
private static void toFileString(String data, String filename) {
// Output to a file
try {
BufferedWriter writer = new BufferedWriter(new FileWriter(filename,true));
writer.write(data + "\n");
writer.close();
} catch (Exception e) {
e.printStackTrace();
}
}
private ArrayList getRandomCluster(int clusterSize) {
ArrayList<String> result = new ArrayList(clusterSize);
// Jingyu: the deactivated code below draws the random cluster from a
// lookup-table-filtered pool table. Doing so can bias the average random
// raw GOid score by sampling from a smaller pool list.
// HashMap hm = new HashMap(this.getClusterGeneList().size());
// while (hm.keySet().size() < clusterSize) {
// hm.put(this.getPoolTable().getOrfNames().get(randInt(this.getPoolTable().getOrfNames().size())), "0");
// }
// result.addAll(hm.keySet());
// Instead, draw a random cluster of the same size as the cluster file from the original ORF pool.
ArrayList localOriPoolTable = this.getOriPoolOrfsName();
// Checkpoint
// System.out.println(localOriPoolTable.size());
for (int i=0;i<clusterSize;i++){
result.add(localOriPoolTable.get(randInt(localOriPoolTable.size())).toString().trim().toLowerCase());
}
return result;
}
private static final Random RNG = new Random();
/**
*
* @param max the exclusive upper bound
*
* @return : a random integer between 0 (inclusive) and max (exclusive)
*/
private int randInt(int max) {
// Random.nextInt(max) avoids the modulo bias and the possible
// Math.abs(Integer.MIN_VALUE) overflow of a hand-rolled abs-mod approach.
return RNG.nextInt(max);
}
// This method is not used for the final code.
// private String [] getZeroStringArray(int length) {
// String [] tmpStrArray = new String[length];
// for (int j=0; j<tmpStrArray.length; j++) {
// tmpStrArray[j] = "0";
// }
// return tmpStrArray;
// }
}

View File: SGD2AttrTable.java

@@ -0,0 +1,155 @@
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.HashMap;
import java.util.Iterator;
/*
* This program first creates an intermediate table and then applies the function
* from Dr. Brett McKinney to create the attribute table.
*/
/**
*
* @author DTian
*/
public class SGD2AttrTable {
public void createIntermediateTable(String inputFile, String outputFile) {
HashMap geneToGODict = new HashMap();
try {
FileReader fr = new FileReader(inputFile);
BufferedReader br = new BufferedReader(fr);
// strRow is used to read lines from the file (the first row is skipped)
String strRow = br.readLine();
// The while loop reads the data rows from the file
while ((strRow = br.readLine()) != null) {
// Check: skip the line if it is blank or a comment line
if (strRow.trim().isEmpty() || strRow.trim().charAt(0) != 'S' ) {
continue;
}
String [] strArray = strRow.trim().split("\\t");
String key = toKey(strArray[10].toUpperCase());
if (key.compareToIgnoreCase("") == 0) {
continue;
}
String value = toValue(strArray[4]);
if (geneToGODict.containsKey(key)) {
geneToGODict.put(key, geneToGODict.get(key)+ "\t" + value);
} else {
geneToGODict.put(key, value);
}
}
br.close();
fr.close();
// Write to output file
FileOutputStream stream; // provides file access
OutputStreamWriter writer; // writes to the file
stream = new FileOutputStream(new File(outputFile), true);
writer = new OutputStreamWriter(stream);
Iterator it = geneToGODict.keySet().iterator();
while(it.hasNext()){
String key = it.next().toString();
String value = geneToGODict.get(key).toString();
writer.write(key + "\t" + value + "\n");
}
writer.flush();
writer.close();
stream.close();
} catch (IOException e) {
// Catch possible io errors from readLine()
System.out.println("IOException error in 'class SGD2AttrTable, method createIntermediateTable'");
}
}
public void createAttrTable(String intermediaFile, String outputFile){
HashMap geneToGODict = new HashMap();
try {
FileReader fr = new FileReader(intermediaFile);
BufferedReader br = new BufferedReader(fr);
// strRow is used to read lines from the file (the first row is skipped)
String strRow = br.readLine();
// The while loop reads the data rows from the file
while ((strRow = br.readLine()) != null) {
// Check: skip the line if it is blank or a comment line
if (strRow.trim().isEmpty() || strRow.trim().charAt(0) != 'S' ) {
continue;
}
String [] strArray = strRow.trim().split("\\t");
String key = toKey(strArray[10].toUpperCase());
if (key.compareToIgnoreCase("") == 0) {
continue;
}
String value = toValue(strArray[4]);
if (geneToGODict.containsKey(key)) {
geneToGODict.put(key, geneToGODict.get(key)+ "\t" + value);
} else {
geneToGODict.put(key, value);
}
}
br.close();
fr.close();
// Write to output file
FileOutputStream stream; // provides file access
OutputStreamWriter writer; // writes to the file
stream = new FileOutputStream(new File(outputFile), true);
writer = new OutputStreamWriter(stream);
Iterator it = geneToGODict.keySet().iterator();
while (it.hasNext()) {
String key = it.next().toString();
String value = geneToGODict.get(key).toString();
writer.write(key + "\t" + value + "\n");
}
writer.flush();
writer.close();
stream.close();
} catch (IOException e) {
// Catch possible io errors from readLine()
System.out.println("IOException error in 'class SGD2AttrTable, method createIntermediateTable'");
}
}
/**
*
* @param raw the string to strip the "GO:" prefix and leading zeros from
* @return the string without the "GO:" prefix and leading zeros
*/
private String toValue(String raw) {
String result = raw.toUpperCase(); //raw should be like: "GO:0005739"
// Delete "GO:"
result = result.substring(3);
// Delete "lead zeros"
while (result.charAt(0) == '0') {
result =result.substring(1);
}
return result;
}
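// Example: toValue("GO:0005739") returns "5739".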
private String toKey(String raw) {
String result = raw.toUpperCase(); // raw is the gene-name field, possibly "NAME|ALIAS|..."
// Find the '|'
int end = result.indexOf('|');
// Get the sub string
if (end < 0) {
return result;
} else {
return result.substring(0, end);
}
}
}

View File: Stats.java

@@ -0,0 +1,75 @@
/* Reference Python implementation that this class ports:
* def stats(self,r):
#returns the average, standard deviation, and min of a sequence
tot = sum(r)
ave = tot/len(r)
sdsq = sum([(i-ave)**2 for i in r])
s = list(r)
s.sort()
#median = s[len(s)//2]
return ave, (sdsq/(len(r)-1 or 1))**.5
def zscore(self,pop_mean,pop_std,raw_goid):
return (raw_goid - pop_mean)/pop_std
*/
/**
*
* @author DTian
*/
public class Stats {
/**
*
* @param data the double array
* @return the standard deviation of the array
*/
public static double getStdDev(double[] data) {
double result = 0.0;
double ave = getMean(data);
for (double d : data) {
result += Math.pow((d-ave), 2);
}
if (data.length > 1) {
return Math.sqrt(result/(data.length-1));
} else {
return Math.sqrt(result);
}
}
/**
*
* @param data the double array
* @return the mean of the double array.
*/
public static double getMean(double[] data) {
double result = 0.0;
for (double d : data) {
result += d;
}
return (result/data.length);
}
/**
*
* @param size the size of the original cluster file
* @return the mean estimated from the fitted function.
*/
public static double getMeanFromFunc(int size) {
return ( -4.8616 + 71.1806/Math.pow(size, 0.33511));
}
/**
*
* @param size the size of the original cluster file
* @return the standard deviation estimated from the fitted function.
*/
public static double getStdDevFromFunc(int size) {
return ( -0.04943 + 56.634/Math.pow(size, 0.89384));
}
public static double getZscore(double popMean, double popStd, double rawGoid) {
return (rawGoid - popMean)/popStd;
}
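// Hypothetical demo, not part of the original pipeline: z-score of a raw GOid
// score against an empirical random distribution (illustrative values only).
public static void main(String[] args) {
double[] randomScores = {2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0};
double mean = getMean(randomScores); // 5.0
double std = getStdDev(randomScores); // ~2.14 (sample standard deviation)
System.out.println(getZscore(mean, std, 8.0)); // ~1.40
}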
}