Squashed initial commit
This commit is contained in:
375
qhtcp-workflow/apps/java/weka-clustering/src/RawGoID.java
Executable file
375
qhtcp-workflow/apps/java/weka-clustering/src/RawGoID.java
Executable file
@@ -0,0 +1,375 @@
|
||||
/*
|
||||
* The input: 3 files; 1 is the cluster file, 2 is the GO matrix file (lookup table),
|
||||
* 3 is the background file (pool)
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.FileReader;
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author DTian
|
||||
*/
|
||||
public class RawGoID {
|
||||
|
||||
private ArrayList clusterGeneList; // for the input cluster file
|
||||
private Matrix poolTable; //for the filtered gene pool list
|
||||
private Matrix lookupTable; // for the lookup attribute table
|
||||
private int oriClusterSize; //for the original cluster size
|
||||
private ArrayList oriPoolOrfsName;//for the complete list of pool table
|
||||
// private String randomFilename;
|
||||
|
||||
public Matrix getLookupTable() {
|
||||
return lookupTable;
|
||||
}
|
||||
|
||||
public void setLookupTable(Matrix lookupTable) {
|
||||
this.lookupTable = lookupTable;
|
||||
}
|
||||
|
||||
public Matrix getPoolTable() {
|
||||
return poolTable;
|
||||
}
|
||||
|
||||
public void setPoolTable(Matrix poolTable) {
|
||||
this.poolTable = poolTable;
|
||||
}
|
||||
|
||||
public ArrayList getClusterGeneList() {
|
||||
return clusterGeneList;
|
||||
}
|
||||
|
||||
public void setClusterGeneList(ArrayList clusterGeneList) {
|
||||
this.clusterGeneList = clusterGeneList;
|
||||
}
|
||||
|
||||
public RawGoID() {
|
||||
clusterGeneList = new ArrayList();
|
||||
poolTable = new Matrix();
|
||||
lookupTable = new Matrix();
|
||||
// randomFilename ="";
|
||||
}
|
||||
|
||||
public void setOriClusterSize(int oriClusterSize) {
|
||||
this.oriClusterSize = oriClusterSize;
|
||||
}
|
||||
|
||||
public int getOriClusterSize () {
|
||||
return oriClusterSize;
|
||||
}
|
||||
|
||||
public void setOriPoolOrfsName(ArrayList oriPoolOrfsName) {
|
||||
this.oriPoolOrfsName = oriPoolOrfsName;
|
||||
}
|
||||
|
||||
public ArrayList getOriPoolOrfsName() {
|
||||
return oriPoolOrfsName;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param clusterFilename : cluster Filename
|
||||
* @param GoMatrixFilename : GoMatrix Filename
|
||||
* @param backGroundFilename : backGround Filename
|
||||
*/
|
||||
public RawGoID(String clusterFilename, String GoMatrixFilename, String backGroundFilename) {
|
||||
try {
|
||||
clusterGeneList = new ArrayList(200);
|
||||
ArrayList refClusterGeneList = new ArrayList (200);
|
||||
|
||||
// Get the smallGeneList (a cluster )
|
||||
BufferedReader br = new BufferedReader(new FileReader(clusterFilename));
|
||||
|
||||
// strRow is used to read line from file
|
||||
String strRow = "";
|
||||
while ((strRow = br.readLine()) != null) {
|
||||
clusterGeneList.add(strRow.trim().toLowerCase());
|
||||
}
|
||||
// System.out.println(clusterGeneList.size());
|
||||
setOriClusterSize(clusterGeneList.size());
|
||||
// System.out.println("original cluster size =" + clusterGeneList.size());
|
||||
|
||||
// Get the mtrix (lookup table)
|
||||
lookupTable = new Matrix(GoMatrixFilename);
|
||||
|
||||
// Get the bigGeneList (pool or back ground file)
|
||||
br = new BufferedReader(new FileReader(backGroundFilename));
|
||||
|
||||
ArrayList poolOrfsName = new ArrayList(5000);
|
||||
while ((strRow = br.readLine()) != null) {
|
||||
poolOrfsName.add(strRow.trim().toLowerCase());
|
||||
}
|
||||
this.setOriPoolOrfsName(poolOrfsName);
|
||||
poolTable = new Matrix();
|
||||
for (int i = 0; i < poolOrfsName.size(); i++) {
|
||||
Object tempKey = poolOrfsName.get(i);
|
||||
if (lookupTable.getMatrix().containsKey(tempKey)) {
|
||||
poolTable.addValue(tempKey, lookupTable.getMatrix().get(tempKey));
|
||||
}
|
||||
}
|
||||
poolTable.setRowSize(poolTable.getMatrix().size());
|
||||
poolTable.setColSize(lookupTable.getColSize());
|
||||
br.close();
|
||||
|
||||
// This loop is to take out any ORF from the cluster gene list if not exist in pool table
|
||||
// not necessary if all cluster ORFs are from pool table
|
||||
for (int i=0;i<refClusterGeneList.size();i++){
|
||||
Object tempKey = clusterGeneList.get(i);
|
||||
if (!poolTable.getMatrix().containsKey(tempKey)){
|
||||
clusterGeneList.remove(i);
|
||||
}
|
||||
}
|
||||
|
||||
// System.out.println("length of real cluster gene List after filtering = " + clusterGeneList.size());
|
||||
// Check point
|
||||
// System.out.println(clusterGeneList);
|
||||
} catch (IOException e) {
|
||||
// Catch possible io errors from readLine()
|
||||
System.out.println("IOException error in 'class GetGoID, constructor'");
|
||||
}
|
||||
|
||||
// Checkpoint
|
||||
// System.out.println("Column size of pooltable is:"+ poolTable.getColSize() +'\t'+ "Row size of pooltable is :"+ poolTable.getRowSize());
|
||||
// randomFilename = "randomOrfName.txt";
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param clusterFilename : cluster Filename
|
||||
* @param GoMatrixFilename : GoMatrix Filename
|
||||
* @param backGroundFilename : backGround Filename
|
||||
*/
|
||||
public RawGoID(String[] clusterName, String GoMatrixFilename, String backGroundFilename) {
|
||||
try {
|
||||
clusterGeneList = new ArrayList(clusterName.length);
|
||||
ArrayList refClusterGeneList = new ArrayList (200);
|
||||
|
||||
// Get the smallGeneList (a cluster )
|
||||
for(String name: clusterName){
|
||||
clusterGeneList.add(name.trim().toLowerCase());
|
||||
}
|
||||
|
||||
// System.out.println(clusterGeneList.size());
|
||||
setOriClusterSize(clusterGeneList.size());
|
||||
// System.out.println("original cluster size =" + clusterGeneList.size());
|
||||
|
||||
// Get the mtrix (lookup table)
|
||||
lookupTable = new Matrix(GoMatrixFilename);
|
||||
|
||||
// Get the bigGeneList (pool or back ground file)
|
||||
BufferedReader br = new BufferedReader(new FileReader(backGroundFilename));
|
||||
ArrayList poolOrfsName = new ArrayList(5000);
|
||||
String strRow = "";
|
||||
while ((strRow = br.readLine()) != null) {
|
||||
poolOrfsName.add(strRow.trim().toLowerCase());
|
||||
}
|
||||
this.setOriPoolOrfsName(poolOrfsName);
|
||||
poolTable = new Matrix();
|
||||
for (int i = 0; i < poolOrfsName.size(); i++) {
|
||||
Object tempKey = poolOrfsName.get(i);
|
||||
if(lookupTable.getMatrix().containsKey(tempKey)){
|
||||
poolTable.addValue(tempKey, lookupTable.getMatrix().get(tempKey));
|
||||
}
|
||||
}
|
||||
poolTable.setRowSize(poolTable.getMatrix().size());
|
||||
poolTable.setColSize(lookupTable.getColSize());
|
||||
br.close();
|
||||
|
||||
// This loop is to take out any ORF from the cluster gene list if not exist in pool table
|
||||
// not necessary if all cluster ORFs are from pool table
|
||||
for (int i=0;i<refClusterGeneList.size();i++){
|
||||
Object tempKey = clusterGeneList.get(i);
|
||||
if (!poolTable.getMatrix().containsKey(tempKey)){
|
||||
clusterGeneList.remove(i);
|
||||
}
|
||||
}
|
||||
|
||||
// System.out.println("length of real cluster gene List after filtering = " + clusterGeneList.size());
|
||||
// Checkpoint
|
||||
// System.out.println(clusterGeneList);
|
||||
} catch (IOException e) {
|
||||
// Catch possible io errors from readLine()
|
||||
System.out.println("IOException error in 'class GetGoID, constructor'");
|
||||
}
|
||||
|
||||
// Checkpoint
|
||||
// System.out.println("Column size of pooltable is:"+ poolTable.getColSize() +'\t'+ "Row size of pooltable is :"+ poolTable.getRowSize());
|
||||
// randomFilename = "randomOrfName.txt";
|
||||
}
|
||||
|
||||
public double getRawGoID() {
|
||||
double result = 0.0;
|
||||
ArrayList fullMatrix = new ArrayList(this.getPoolTable().getRowSize());
|
||||
ArrayList subMatrix = new ArrayList(this.getClusterGeneList().size());
|
||||
|
||||
// Fill the fullMatrix with pool table data
|
||||
Iterator it = this.getPoolTable().getMatrix().keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
Object key = it.next();
|
||||
fullMatrix.add(this.getPoolTable().getMatrix().get(key.toString().toLowerCase()));
|
||||
}
|
||||
// System.out.println("size of fullMatrix is:"+ fullMatrix.size());
|
||||
|
||||
// Fill the subMatrix with lookup table data and cluster information
|
||||
for (Object element : this.getClusterGeneList()) {
|
||||
if (this.getLookupTable().getMatrix().containsKey(element)) {
|
||||
subMatrix.add(this.getLookupTable().getMatrix().get(element.toString().toLowerCase()));
|
||||
}
|
||||
}
|
||||
// System.out.println("size of subMatrix is:"+ subMatrix.size());
|
||||
|
||||
// Transpose the 2 matrix
|
||||
ArrayList attrByOrfFullMatrix = this.transpose(fullMatrix);
|
||||
ArrayList attrByOrfSubMatrix = this.transpose(subMatrix);
|
||||
|
||||
// System.out.println("size of transposed fullMatrix is:"+ attrByOrfFullMatrix.size());
|
||||
// System.out.println("size of transposed subMatrix is:"+ attrByOrfSubMatrix.size());
|
||||
|
||||
// Calculate the raw GoID
|
||||
for (int i = 0; i < attrByOrfFullMatrix.size(); i++) {
|
||||
// Added by tdh, from the source code, we need not do this step
|
||||
int nonZeroCount = 0;
|
||||
String[] tempArray = (String[]) attrByOrfFullMatrix.get(i);
|
||||
for (int j = 0; j < tempArray.length; j++) {
|
||||
// System.out.println(Integer.parseInt(tempArray[j].trim()));
|
||||
if (tempArray[j].trim().compareToIgnoreCase("1")==0 ) {
|
||||
// System.out.println(Integer.parseInt(tempArray[j].trim()));
|
||||
// System.out.println(tempArray[j].trim().compareToIgnoreCase("1"));//Jingyu added for checking
|
||||
nonZeroCount++;
|
||||
break; // added by tdh, from the source code, we need not do this step//Jingyu notes: the break may help the code running faster.
|
||||
// System.out.println("nonzeroCount =" + nonZeroCount);//Jingyu add
|
||||
}
|
||||
}
|
||||
// System.out.println("nonzeroCount =" + nonZeroCount);//Jingyu add
|
||||
if (nonZeroCount >= 0) {
|
||||
result = result + Information.relativeEntropy(
|
||||
((String[]) attrByOrfSubMatrix.get(i)),
|
||||
(String[]) (attrByOrfFullMatrix.get(i)));
|
||||
}
|
||||
// System.out.println(Information.relativeEntropy(
|
||||
// ((String[]) attrByOrfSubMatrix.get(i)), (String[]) (attrByOrfFullMatrix.get(i))));
|
||||
}
|
||||
// System.out.println("result =" + result);
|
||||
return result;
|
||||
}
|
||||
|
||||
private ArrayList transpose(ArrayList data) {
|
||||
ArrayList result = new ArrayList(data.size());
|
||||
// Do transpose here
|
||||
int rowSize = data.size();
|
||||
int colSize = ((String[]) data.get(0)).length;
|
||||
|
||||
String[][] matrix = new String[colSize][rowSize];
|
||||
for (int i = 0; i < rowSize; i++) {
|
||||
String[] temp = (String[]) data.get(i);
|
||||
for (int j = 0; j < colSize; j++) {
|
||||
// System.out.println("j is : " + j);
|
||||
matrix[j][i] = temp[j];
|
||||
}
|
||||
}
|
||||
|
||||
// Convert to ArrayList
|
||||
for (int i = 0; i < matrix.length; i++) {
|
||||
result.add(matrix[i]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public double getRandomRawGoID() {
|
||||
double result = 0.0;
|
||||
this.setClusterGeneList(this.getRandomCluster(this.getOriClusterSize()));
|
||||
result = this.getRawGoID();
|
||||
if (Double.isNaN(result)) {
|
||||
return getRandomRawGoID();
|
||||
} else {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
private void toFile(HashMap data, String filename) {
|
||||
|
||||
// Output to a file
|
||||
try {
|
||||
BufferedWriter writer = new BufferedWriter(new FileWriter(filename));
|
||||
for (Object key : data.keySet()) {
|
||||
writer.write(key.toString() + "\n");
|
||||
}
|
||||
writer.close();
|
||||
} catch (Exception e) {
|
||||
System.err.println(e.getStackTrace());
|
||||
}
|
||||
|
||||
}
|
||||
private static void toFileString(String data, String filename) {
|
||||
|
||||
// Output to a file
|
||||
try {
|
||||
BufferedWriter writer = new BufferedWriter(new FileWriter(filename,true));
|
||||
writer.write(data + "\n");
|
||||
writer.close();
|
||||
} catch (Exception e) {
|
||||
System.err.println(e.getStackTrace());
|
||||
}
|
||||
}
|
||||
|
||||
private ArrayList getRandomCluster(int clusterSize) {
|
||||
ArrayList<String> result = new ArrayList(clusterSize);
|
||||
|
||||
// Jingyu: The following segment of code, which is deactivated, is designed to get the random cluster list from a lookuptable-filtered pooltable
|
||||
// Jingyu: To do so may cause a bias in average of random raw GOid score by using a smaller pool list
|
||||
// get a random cluster with same size of the cluster file and then calculate the Goid
|
||||
// 1, get the random orf names to a ArrayList
|
||||
|
||||
// HashMap hm = new HashMap(this.getClusterGeneList().size());
|
||||
// while (hm.keySet().size() < clusterSize) {
|
||||
// hm.put(this.getPoolTable().getOrfNames().get(randInt(this.getPoolTable().getOrfNames().size())), "0");
|
||||
// }
|
||||
// result.addAll(hm.keySet());
|
||||
|
||||
// Get a random cluster with same size of the cluster file from the original ORF pool
|
||||
// Extra step added by Jingyu to remove the ORFs not existing in pooltable;
|
||||
ArrayList localOriPoolTable = new ArrayList();
|
||||
localOriPoolTable = this.getOriPoolOrfsName();
|
||||
|
||||
// Checkpoint
|
||||
// System.out.println(localOriPoolTable.size());
|
||||
for (int i=0;i<clusterSize;i++){
|
||||
result.add(localOriPoolTable.get(randInt(localOriPoolTable.size())).toString().trim().toLowerCase());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param max the max integer you want to generate
|
||||
*
|
||||
* @return : a random integar between 0 and max
|
||||
*/
|
||||
private int randInt(int max) {
|
||||
Random r = new Random((int) (System.nanoTime()));
|
||||
int random = r.nextInt();
|
||||
random = Math.abs(random);
|
||||
random = random % max;
|
||||
// random += 1;
|
||||
return random;
|
||||
}
|
||||
|
||||
// This method is not used for the final code.
|
||||
// private String [] getZeroStringArray(int length) {
|
||||
// String [] tmpStrArray = new String[length];
|
||||
// for (int j=0; j<tmpStrArray.length; j++) {
|
||||
// tmpStrArray[j] = "0";
|
||||
// }
|
||||
// return tmpStrArray;
|
||||
// }
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user