/*
 * Input: three files.
 *   1. the cluster file,
 *   2. the GO matrix file (lookup table),
 *   3. the background file (gene pool).
 */
|
|
|
|
import java.io.BufferedReader;
|
|
import java.io.BufferedWriter;
|
|
import java.io.FileReader;
|
|
import java.io.FileWriter;
|
|
import java.io.IOException;
|
|
import java.util.ArrayList;
|
|
import java.util.HashMap;
|
|
import java.util.Iterator;
|
|
import java.util.Random;
|
|
|
|
/**
|
|
*
|
|
* @author DTian
|
|
*/
|
|
public class RawGoID {
|
|
|
|
private ArrayList clusterGeneList; // for the input cluster file
|
|
private Matrix poolTable; //for the filtered gene pool list
|
|
private Matrix lookupTable; // for the lookup attribute table
|
|
private int oriClusterSize; //for the original cluster size
|
|
private ArrayList oriPoolOrfsName;//for the complete list of pool table
|
|
// private String randomFilename;
|
|
|
|
public Matrix getLookupTable() {
|
|
return lookupTable;
|
|
}
|
|
|
|
public void setLookupTable(Matrix lookupTable) {
|
|
this.lookupTable = lookupTable;
|
|
}
|
|
|
|
public Matrix getPoolTable() {
|
|
return poolTable;
|
|
}
|
|
|
|
public void setPoolTable(Matrix poolTable) {
|
|
this.poolTable = poolTable;
|
|
}
|
|
|
|
public ArrayList getClusterGeneList() {
|
|
return clusterGeneList;
|
|
}
|
|
|
|
public void setClusterGeneList(ArrayList clusterGeneList) {
|
|
this.clusterGeneList = clusterGeneList;
|
|
}
|
|
|
|
public RawGoID() {
|
|
clusterGeneList = new ArrayList();
|
|
poolTable = new Matrix();
|
|
lookupTable = new Matrix();
|
|
// randomFilename ="";
|
|
}
|
|
|
|
public void setOriClusterSize(int oriClusterSize) {
|
|
this.oriClusterSize = oriClusterSize;
|
|
}
|
|
|
|
public int getOriClusterSize () {
|
|
return oriClusterSize;
|
|
}
|
|
|
|
public void setOriPoolOrfsName(ArrayList oriPoolOrfsName) {
|
|
this.oriPoolOrfsName = oriPoolOrfsName;
|
|
}
|
|
|
|
public ArrayList getOriPoolOrfsName() {
|
|
return oriPoolOrfsName;
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param clusterFilename : cluster Filename
|
|
* @param GoMatrixFilename : GoMatrix Filename
|
|
* @param backGroundFilename : backGround Filename
|
|
*/
|
|
public RawGoID(String clusterFilename, String GoMatrixFilename, String backGroundFilename) {
|
|
try {
|
|
clusterGeneList = new ArrayList(200);
|
|
ArrayList refClusterGeneList = new ArrayList (200);
|
|
|
|
// Get the smallGeneList (a cluster )
|
|
BufferedReader br = new BufferedReader(new FileReader(clusterFilename));
|
|
|
|
// strRow is used to read line from file
|
|
String strRow = "";
|
|
while ((strRow = br.readLine()) != null) {
|
|
clusterGeneList.add(strRow.trim().toLowerCase());
|
|
}
|
|
// System.out.println(clusterGeneList.size());
|
|
setOriClusterSize(clusterGeneList.size());
|
|
// System.out.println("original cluster size =" + clusterGeneList.size());
|
|
|
|
// Get the mtrix (lookup table)
|
|
lookupTable = new Matrix(GoMatrixFilename);
|
|
|
|
// Get the bigGeneList (pool or back ground file)
|
|
br = new BufferedReader(new FileReader(backGroundFilename));
|
|
|
|
ArrayList poolOrfsName = new ArrayList(5000);
|
|
while ((strRow = br.readLine()) != null) {
|
|
poolOrfsName.add(strRow.trim().toLowerCase());
|
|
}
|
|
this.setOriPoolOrfsName(poolOrfsName);
|
|
poolTable = new Matrix();
|
|
for (int i = 0; i < poolOrfsName.size(); i++) {
|
|
Object tempKey = poolOrfsName.get(i);
|
|
if (lookupTable.getMatrix().containsKey(tempKey)) {
|
|
poolTable.addValue(tempKey, lookupTable.getMatrix().get(tempKey));
|
|
}
|
|
}
|
|
poolTable.setRowSize(poolTable.getMatrix().size());
|
|
poolTable.setColSize(lookupTable.getColSize());
|
|
br.close();
|
|
|
|
// This loop is to take out any ORF from the cluster gene list if not exist in pool table
|
|
// not necessary if all cluster ORFs are from pool table
|
|
for (int i=0;i<refClusterGeneList.size();i++){
|
|
Object tempKey = clusterGeneList.get(i);
|
|
if (!poolTable.getMatrix().containsKey(tempKey)){
|
|
clusterGeneList.remove(i);
|
|
}
|
|
}
|
|
|
|
// System.out.println("length of real cluster gene List after filtering = " + clusterGeneList.size());
|
|
// Check point
|
|
// System.out.println(clusterGeneList);
|
|
} catch (IOException e) {
|
|
// Catch possible io errors from readLine()
|
|
System.out.println("IOException error in 'class GetGoID, constructor'");
|
|
}
|
|
|
|
// Checkpoint
|
|
// System.out.println("Column size of pooltable is:"+ poolTable.getColSize() +'\t'+ "Row size of pooltable is :"+ poolTable.getRowSize());
|
|
// randomFilename = "randomOrfName.txt";
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param clusterFilename : cluster Filename
|
|
* @param GoMatrixFilename : GoMatrix Filename
|
|
* @param backGroundFilename : backGround Filename
|
|
*/
|
|
public RawGoID(String[] clusterName, String GoMatrixFilename, String backGroundFilename) {
|
|
try {
|
|
clusterGeneList = new ArrayList(clusterName.length);
|
|
ArrayList refClusterGeneList = new ArrayList (200);
|
|
|
|
// Get the smallGeneList (a cluster )
|
|
for(String name: clusterName){
|
|
clusterGeneList.add(name.trim().toLowerCase());
|
|
}
|
|
|
|
// System.out.println(clusterGeneList.size());
|
|
setOriClusterSize(clusterGeneList.size());
|
|
// System.out.println("original cluster size =" + clusterGeneList.size());
|
|
|
|
// Get the mtrix (lookup table)
|
|
lookupTable = new Matrix(GoMatrixFilename);
|
|
|
|
// Get the bigGeneList (pool or back ground file)
|
|
BufferedReader br = new BufferedReader(new FileReader(backGroundFilename));
|
|
ArrayList poolOrfsName = new ArrayList(5000);
|
|
String strRow = "";
|
|
while ((strRow = br.readLine()) != null) {
|
|
poolOrfsName.add(strRow.trim().toLowerCase());
|
|
}
|
|
this.setOriPoolOrfsName(poolOrfsName);
|
|
poolTable = new Matrix();
|
|
for (int i = 0; i < poolOrfsName.size(); i++) {
|
|
Object tempKey = poolOrfsName.get(i);
|
|
if(lookupTable.getMatrix().containsKey(tempKey)){
|
|
poolTable.addValue(tempKey, lookupTable.getMatrix().get(tempKey));
|
|
}
|
|
}
|
|
poolTable.setRowSize(poolTable.getMatrix().size());
|
|
poolTable.setColSize(lookupTable.getColSize());
|
|
br.close();
|
|
|
|
// This loop is to take out any ORF from the cluster gene list if not exist in pool table
|
|
// not necessary if all cluster ORFs are from pool table
|
|
for (int i=0;i<refClusterGeneList.size();i++){
|
|
Object tempKey = clusterGeneList.get(i);
|
|
if (!poolTable.getMatrix().containsKey(tempKey)){
|
|
clusterGeneList.remove(i);
|
|
}
|
|
}
|
|
|
|
// System.out.println("length of real cluster gene List after filtering = " + clusterGeneList.size());
|
|
// Checkpoint
|
|
// System.out.println(clusterGeneList);
|
|
} catch (IOException e) {
|
|
// Catch possible io errors from readLine()
|
|
System.out.println("IOException error in 'class GetGoID, constructor'");
|
|
}
|
|
|
|
// Checkpoint
|
|
// System.out.println("Column size of pooltable is:"+ poolTable.getColSize() +'\t'+ "Row size of pooltable is :"+ poolTable.getRowSize());
|
|
// randomFilename = "randomOrfName.txt";
|
|
}
|
|
|
|
public double getRawGoID() {
|
|
double result = 0.0;
|
|
ArrayList fullMatrix = new ArrayList(this.getPoolTable().getRowSize());
|
|
ArrayList subMatrix = new ArrayList(this.getClusterGeneList().size());
|
|
|
|
// Fill the fullMatrix with pool table data
|
|
Iterator it = this.getPoolTable().getMatrix().keySet().iterator();
|
|
while (it.hasNext()) {
|
|
Object key = it.next();
|
|
fullMatrix.add(this.getPoolTable().getMatrix().get(key.toString().toLowerCase()));
|
|
}
|
|
// System.out.println("size of fullMatrix is:"+ fullMatrix.size());
|
|
|
|
// Fill the subMatrix with lookup table data and cluster information
|
|
for (Object element : this.getClusterGeneList()) {
|
|
if (this.getLookupTable().getMatrix().containsKey(element)) {
|
|
subMatrix.add(this.getLookupTable().getMatrix().get(element.toString().toLowerCase()));
|
|
}
|
|
}
|
|
// System.out.println("size of subMatrix is:"+ subMatrix.size());
|
|
|
|
// Transpose the 2 matrix
|
|
ArrayList attrByOrfFullMatrix = this.transpose(fullMatrix);
|
|
ArrayList attrByOrfSubMatrix = this.transpose(subMatrix);
|
|
|
|
// System.out.println("size of transposed fullMatrix is:"+ attrByOrfFullMatrix.size());
|
|
// System.out.println("size of transposed subMatrix is:"+ attrByOrfSubMatrix.size());
|
|
|
|
// Calculate the raw GoID
|
|
for (int i = 0; i < attrByOrfFullMatrix.size(); i++) {
|
|
// Added by tdh, from the source code, we need not do this step
|
|
int nonZeroCount = 0;
|
|
String[] tempArray = (String[]) attrByOrfFullMatrix.get(i);
|
|
for (int j = 0; j < tempArray.length; j++) {
|
|
// System.out.println(Integer.parseInt(tempArray[j].trim()));
|
|
if (tempArray[j].trim().compareToIgnoreCase("1")==0 ) {
|
|
// System.out.println(Integer.parseInt(tempArray[j].trim()));
|
|
// System.out.println(tempArray[j].trim().compareToIgnoreCase("1"));//Jingyu added for checking
|
|
nonZeroCount++;
|
|
break; // added by tdh, from the source code, we need not do this step//Jingyu notes: the break may help the code running faster.
|
|
// System.out.println("nonzeroCount =" + nonZeroCount);//Jingyu add
|
|
}
|
|
}
|
|
// System.out.println("nonzeroCount =" + nonZeroCount);//Jingyu add
|
|
if (nonZeroCount >= 0) {
|
|
result = result + Information.relativeEntropy(
|
|
((String[]) attrByOrfSubMatrix.get(i)),
|
|
(String[]) (attrByOrfFullMatrix.get(i)));
|
|
}
|
|
// System.out.println(Information.relativeEntropy(
|
|
// ((String[]) attrByOrfSubMatrix.get(i)), (String[]) (attrByOrfFullMatrix.get(i))));
|
|
}
|
|
// System.out.println("result =" + result);
|
|
return result;
|
|
}
|
|
|
|
private ArrayList transpose(ArrayList data) {
|
|
ArrayList result = new ArrayList(data.size());
|
|
// Do transpose here
|
|
int rowSize = data.size();
|
|
int colSize = ((String[]) data.get(0)).length;
|
|
|
|
String[][] matrix = new String[colSize][rowSize];
|
|
for (int i = 0; i < rowSize; i++) {
|
|
String[] temp = (String[]) data.get(i);
|
|
for (int j = 0; j < colSize; j++) {
|
|
// System.out.println("j is : " + j);
|
|
matrix[j][i] = temp[j];
|
|
}
|
|
}
|
|
|
|
// Convert to ArrayList
|
|
for (int i = 0; i < matrix.length; i++) {
|
|
result.add(matrix[i]);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
public double getRandomRawGoID() {
|
|
double result = 0.0;
|
|
this.setClusterGeneList(this.getRandomCluster(this.getOriClusterSize()));
|
|
result = this.getRawGoID();
|
|
if (Double.isNaN(result)) {
|
|
return getRandomRawGoID();
|
|
} else {
|
|
return result;
|
|
}
|
|
}
|
|
|
|
private void toFile(HashMap data, String filename) {
|
|
|
|
// Output to a file
|
|
try {
|
|
BufferedWriter writer = new BufferedWriter(new FileWriter(filename));
|
|
for (Object key : data.keySet()) {
|
|
writer.write(key.toString() + "\n");
|
|
}
|
|
writer.close();
|
|
} catch (Exception e) {
|
|
System.err.println(e.getStackTrace());
|
|
}
|
|
|
|
}
|
|
private static void toFileString(String data, String filename) {
|
|
|
|
// Output to a file
|
|
try {
|
|
BufferedWriter writer = new BufferedWriter(new FileWriter(filename,true));
|
|
writer.write(data + "\n");
|
|
writer.close();
|
|
} catch (Exception e) {
|
|
System.err.println(e.getStackTrace());
|
|
}
|
|
}
|
|
|
|
private ArrayList getRandomCluster(int clusterSize) {
|
|
ArrayList<String> result = new ArrayList(clusterSize);
|
|
|
|
// Jingyu: The following segment of code, which is deactivated, is designed to get the random cluster list from a lookuptable-filtered pooltable
|
|
// Jingyu: To do so may cause a bias in average of random raw GOid score by using a smaller pool list
|
|
// get a random cluster with same size of the cluster file and then calculate the Goid
|
|
// 1, get the random orf names to a ArrayList
|
|
|
|
// HashMap hm = new HashMap(this.getClusterGeneList().size());
|
|
// while (hm.keySet().size() < clusterSize) {
|
|
// hm.put(this.getPoolTable().getOrfNames().get(randInt(this.getPoolTable().getOrfNames().size())), "0");
|
|
// }
|
|
// result.addAll(hm.keySet());
|
|
|
|
// Get a random cluster with same size of the cluster file from the original ORF pool
|
|
// Extra step added by Jingyu to remove the ORFs not existing in pooltable;
|
|
ArrayList localOriPoolTable = new ArrayList();
|
|
localOriPoolTable = this.getOriPoolOrfsName();
|
|
|
|
// Checkpoint
|
|
// System.out.println(localOriPoolTable.size());
|
|
for (int i=0;i<clusterSize;i++){
|
|
result.add(localOriPoolTable.get(randInt(localOriPoolTable.size())).toString().trim().toLowerCase());
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param max the max integer you want to generate
|
|
*
|
|
* @return : a random integar between 0 and max
|
|
*/
|
|
private int randInt(int max) {
|
|
Random r = new Random((int) (System.nanoTime()));
|
|
int random = r.nextInt();
|
|
random = Math.abs(random);
|
|
random = random % max;
|
|
// random += 1;
|
|
return random;
|
|
}
|
|
|
|
// This method is not used for the final code.
|
|
// private String [] getZeroStringArray(int length) {
|
|
// String [] tmpStrArray = new String[length];
|
|
// for (int j=0; j<tmpStrArray.length; j++) {
|
|
// tmpStrArray[j] = "0";
|
|
// }
|
|
// return tmpStrArray;
|
|
// }
|
|
}
|
|
|