Files
hartman-server/qhtcp-workflow/apps/java/weka-clustering/src/RawGoID.java

376 lines
14 KiB
Java
Executable File

/*
 * Input: three files.
 *   1. The cluster file.
 *   2. The GO matrix file (the lookup table).
 *   3. The background file (the gene pool).
 */
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Random;
/**
 * Scores a gene cluster against a background gene pool using an attribute
 * lookup table.
 *
 * <p>Three inputs are involved: the cluster (a list of ORF names), the lookup
 * table (a {@code Matrix} keyed by ORF name) and the background pool (a list of
 * ORF names). The pool table holds the lookup-table entries whose keys appear in
 * the background list. ORF names read by the constructors are trimmed and
 * lower-cased before use.
 *
 * @author DTian
 */
public class RawGoID {

    /**
     * Maximum number of random clusters {@link #getRandomRawGoID()} draws before
     * giving up. The previous implementation retried by unbounded recursion and
     * ended in a {@code StackOverflowError} when every draw produced NaN.
     */
    private static final int MAX_RANDOM_ATTEMPTS = 10000;

    /** Single generator shared by all draws instead of re-seeding one per call. */
    private static final Random RANDOM = new Random();

    private ArrayList clusterGeneList; // ORF names of the cluster being scored
    private Matrix poolTable; // lookup-table entries restricted to the background pool
    private Matrix lookupTable; // the attribute lookup table
    private int oriClusterSize; // cluster size as supplied, recorded before filtering
    private ArrayList oriPoolOrfsName; // complete (unfiltered) list of background ORF names

    public Matrix getLookupTable() {
        return lookupTable;
    }

    public void setLookupTable(Matrix lookupTable) {
        this.lookupTable = lookupTable;
    }

    public Matrix getPoolTable() {
        return poolTable;
    }

    public void setPoolTable(Matrix poolTable) {
        this.poolTable = poolTable;
    }

    public ArrayList getClusterGeneList() {
        return clusterGeneList;
    }

    public void setClusterGeneList(ArrayList clusterGeneList) {
        this.clusterGeneList = clusterGeneList;
    }

    /**
     * Creates an instance with an empty cluster list and empty pool and lookup
     * tables. The original pool ORF list is left unset.
     */
    public RawGoID() {
        clusterGeneList = new ArrayList();
        poolTable = new Matrix();
        lookupTable = new Matrix();
    }

    public void setOriClusterSize(int oriClusterSize) {
        this.oriClusterSize = oriClusterSize;
    }

    public int getOriClusterSize() {
        return oriClusterSize;
    }

    public void setOriPoolOrfsName(ArrayList oriPoolOrfsName) {
        this.oriPoolOrfsName = oriPoolOrfsName;
    }

    public ArrayList getOriPoolOrfsName() {
        return oriPoolOrfsName;
    }

    /**
     * Builds the instance from three files.
     *
     * <p>Every line of the cluster file becomes one cluster entry (trimmed and
     * lower-cased); the number of lines read is recorded as the original cluster
     * size. After the tables are loaded, cluster entries that are not keys of the
     * pool table are removed from the cluster list.
     *
     * <p>An {@code IOException} is reported on standard output and otherwise
     * swallowed, which leaves the instance only partially initialised.
     *
     * @param clusterFilename : cluster Filename
     * @param GoMatrixFilename : GoMatrix Filename
     * @param backGroundFilename : backGround Filename
     */
    public RawGoID(String clusterFilename, String GoMatrixFilename, String backGroundFilename) {
        clusterGeneList = new ArrayList(200);
        try {
            readNormalizedLines(clusterFilename, clusterGeneList);
            setOriClusterSize(clusterGeneList.size());
            loadTables(GoMatrixFilename, backGroundFilename);
            removeClusterGenesMissingFromPool();
        } catch (IOException e) {
            reportConstructorFailure(e);
        }
    }

    /**
     * Builds the instance from an in-memory cluster plus two files.
     *
     * <p>Each name is trimmed and lower-cased; the number of names is recorded as
     * the original cluster size. After the tables are loaded, cluster entries that
     * are not keys of the pool table are removed from the cluster list.
     *
     * <p>An {@code IOException} is reported on standard output and otherwise
     * swallowed, which leaves the instance only partially initialised.
     *
     * @param clusterName : ORF names of the cluster
     * @param GoMatrixFilename : GoMatrix Filename
     * @param backGroundFilename : backGround Filename
     */
    public RawGoID(String[] clusterName, String GoMatrixFilename, String backGroundFilename) {
        clusterGeneList = new ArrayList(clusterName.length);
        try {
            for (String name : clusterName) {
                clusterGeneList.add(name.trim().toLowerCase());
            }
            setOriClusterSize(clusterGeneList.size());
            loadTables(GoMatrixFilename, backGroundFilename);
            removeClusterGenesMissingFromPool();
        } catch (IOException e) {
            reportConstructorFailure(e);
        }
    }

    /** Appends every line of the file, trimmed and lower-cased, to {@code target}. */
    private static void readNormalizedLines(String filename, ArrayList target) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(filename));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                target.add(line.trim().toLowerCase());
            }
        } finally {
            // Close on every path; the readers used to leak on failure and the
            // cluster-file reader was never closed at all.
            reader.close();
        }
    }

    /** Loads the lookup table and background list, then derives the pool table from them. */
    private void loadTables(String goMatrixFilename, String backGroundFilename) throws IOException {
        lookupTable = new Matrix(goMatrixFilename);

        ArrayList poolOrfsName = new ArrayList(5000);
        readNormalizedLines(backGroundFilename, poolOrfsName);
        this.setOriPoolOrfsName(poolOrfsName);

        // Keep only the background ORFs that the lookup table knows about.
        poolTable = new Matrix();
        for (int i = 0; i < poolOrfsName.size(); i++) {
            Object orf = poolOrfsName.get(i);
            if (lookupTable.getMatrix().containsKey(orf)) {
                poolTable.addValue(orf, lookupTable.getMatrix().get(orf));
            }
        }
        poolTable.setRowSize(poolTable.getMatrix().size());
        poolTable.setColSize(lookupTable.getColSize());
    }

    /**
     * Removes from the cluster list every entry that is not a key of the pool table.
     *
     * <p>The earlier code looped over a list that was never filled, so this
     * documented step never executed. Removal goes through the iterator so that
     * no element is skipped when its predecessor is removed.
     */
    private void removeClusterGenesMissingFromPool() {
        Iterator it = clusterGeneList.iterator();
        while (it.hasNext()) {
            if (!poolTable.getMatrix().containsKey(it.next())) {
                it.remove();
            }
        }
    }

    /** Reports a constructor I/O failure on standard output, as the class always has. */
    private static void reportConstructorFailure(IOException e) {
        System.out.println("IOException error in 'class RawGoID, constructor': " + e.getMessage());
    }

    /**
     * Computes the raw GoID score of the current cluster.
     *
     * <p>The pool-table rows form the "full" matrix and the lookup-table rows of
     * the cluster entries found in the lookup table form the "sub" matrix. Both
     * are transposed so that each row holds one attribute across ORFs, and the
     * score is the sum over attributes of
     * {@code Information.relativeEntropy(subRow, fullRow)}.
     *
     * @return the summed score; callers in this class treat NaN as "undefined"
     * @throws IndexOutOfBoundsException if the pool table is empty or no cluster
     *         entry is present in the lookup table (nothing to transpose)
     */
    public double getRawGoID() {
        ArrayList fullMatrix = new ArrayList(this.getPoolTable().getRowSize());
        ArrayList subMatrix = new ArrayList(this.getClusterGeneList().size());

        // Fill the fullMatrix with pool table data
        Iterator it = this.getPoolTable().getMatrix().keySet().iterator();
        while (it.hasNext()) {
            Object key = it.next();
            fullMatrix.add(this.getPoolTable().getMatrix().get(key.toString().toLowerCase()));
        }

        // Fill the subMatrix with lookup table data and cluster information
        for (Object element : this.getClusterGeneList()) {
            if (this.getLookupTable().getMatrix().containsKey(element)) {
                subMatrix.add(this.getLookupTable().getMatrix().get(element.toString().toLowerCase()));
            }
        }

        // Transpose both matrices: one row per attribute, one column per ORF
        ArrayList attrByOrfFullMatrix = this.transpose(fullMatrix);
        ArrayList attrByOrfSubMatrix = this.transpose(subMatrix);

        // Every attribute contributes. The previous version first counted "1"
        // entries per attribute but then tested the count with ">= 0", which is
        // always true, so that count never influenced the result and is omitted.
        double result = 0.0;
        for (int i = 0; i < attrByOrfFullMatrix.size(); i++) {
            result = result + Information.relativeEntropy(
                    (String[]) attrByOrfSubMatrix.get(i),
                    (String[]) attrByOrfFullMatrix.get(i));
        }
        return result;
    }

    /**
     * Transposes a list of {@code String[]} rows.
     *
     * @param data non-empty list of {@code String[]}; the first row fixes the column count
     * @return a list with one {@code String[]} per input column
     * @throws IndexOutOfBoundsException if {@code data} is empty
     */
    private ArrayList transpose(ArrayList data) {
        int rowSize = data.size();
        int colSize = ((String[]) data.get(0)).length;
        String[][] matrix = new String[colSize][rowSize];
        for (int i = 0; i < rowSize; i++) {
            String[] row = (String[]) data.get(i);
            for (int j = 0; j < colSize; j++) {
                matrix[j][i] = row[j];
            }
        }
        // The result holds one entry per column, so size it by colSize.
        ArrayList result = new ArrayList(colSize);
        for (int i = 0; i < matrix.length; i++) {
            result.add(matrix[i]);
        }
        return result;
    }

    /**
     * Replaces the cluster with a random one of the original cluster size and
     * scores it, redrawing whenever the score is NaN.
     *
     * @return the first non-NaN score obtained
     * @throws IllegalStateException if {@value #MAX_RANDOM_ATTEMPTS} consecutive
     *         draws all score NaN
     */
    public double getRandomRawGoID() {
        // Iterative retry: the recursive form grew the stack on every NaN draw.
        for (int attempt = 0; attempt < MAX_RANDOM_ATTEMPTS; attempt++) {
            this.setClusterGeneList(this.getRandomCluster(this.getOriClusterSize()));
            double result = this.getRawGoID();
            if (!Double.isNaN(result)) {
                return result;
            }
        }
        throw new IllegalStateException(
                "No random cluster produced a defined raw GoID after "
                        + MAX_RANDOM_ATTEMPTS + " attempts");
    }

    /** Writes each key of {@code data} on its own line to {@code filename} (best effort). */
    private void toFile(HashMap data, String filename) {
        try {
            BufferedWriter writer = new BufferedWriter(new FileWriter(filename));
            try {
                for (Object key : data.keySet()) {
                    writer.write(key.toString() + "\n");
                }
            } finally {
                writer.close();
            }
        } catch (Exception e) {
            // Printing e.getStackTrace() only showed an array reference.
            e.printStackTrace();
        }
    }

    /** Appends {@code data} followed by a newline to {@code filename} (best effort). */
    private static void toFileString(String data, String filename) {
        try {
            BufferedWriter writer = new BufferedWriter(new FileWriter(filename, true));
            try {
                writer.write(data + "\n");
            } finally {
                writer.close();
            }
        } catch (Exception e) {
            // Printing e.getStackTrace() only showed an array reference.
            e.printStackTrace();
        }
    }

    /**
     * Draws {@code clusterSize} ORF names from the original (unfiltered) pool
     * list. Draws are independent, so the same name can appear more than once.
     *
     * <p>Jingyu: sampling from the lookup-table-filtered pool table instead was
     * deactivated, because using that smaller list may bias the average of the
     * random raw GoID scores.
     *
     * @param clusterSize number of names to draw
     * @return the drawn names, trimmed and lower-cased
     */
    private ArrayList getRandomCluster(int clusterSize) {
        ArrayList<String> result = new ArrayList<String>(clusterSize);
        ArrayList pool = this.getOriPoolOrfsName();
        for (int i = 0; i < clusterSize; i++) {
            result.add(pool.get(randInt(pool.size())).toString().trim().toLowerCase());
        }
        return result;
    }

    /**
     * Returns a uniformly distributed index.
     *
     * <p>Uses the shared generator and {@code nextInt(bound)}. The earlier
     * {@code Math.abs(nextInt()) % max} form could yield a negative value (for
     * {@code Integer.MIN_VALUE}) and re-seeded a new generator from a truncated
     * nanosecond clock on every call.
     *
     * @param max exclusive upper bound; must be positive
     * @return a random integer between 0 (inclusive) and max (exclusive)
     * @throws IllegalArgumentException if {@code max} is not positive
     */
    private int randInt(int max) {
        return RANDOM.nextInt(max);
    }
}