123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375 |
/*
 * Input: three files.
 *   1. The cluster file.
 *   2. The GO matrix file (the lookup table).
 *   3. The background file (the gene pool).
 */
- import java.io.BufferedReader;
- import java.io.BufferedWriter;
- import java.io.FileReader;
- import java.io.FileWriter;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.HashMap;
- import java.util.Iterator;
- import java.util.Random;
- /**
- *
- * @author DTian
- */
- public class RawGoID {
- private ArrayList clusterGeneList; // for the input cluster file
- private Matrix poolTable; //for the filtered gene pool list
- private Matrix lookupTable; // for the lookup attribute table
- private int oriClusterSize; //for the original cluster size
- private ArrayList oriPoolOrfsName;//for the complete list of pool table
- // private String randomFilename;
- public Matrix getLookupTable() {
- return lookupTable;
- }
- public void setLookupTable(Matrix lookupTable) {
- this.lookupTable = lookupTable;
- }
- public Matrix getPoolTable() {
- return poolTable;
- }
- public void setPoolTable(Matrix poolTable) {
- this.poolTable = poolTable;
- }
- public ArrayList getClusterGeneList() {
- return clusterGeneList;
- }
- public void setClusterGeneList(ArrayList clusterGeneList) {
- this.clusterGeneList = clusterGeneList;
- }
- public RawGoID() {
- clusterGeneList = new ArrayList();
- poolTable = new Matrix();
- lookupTable = new Matrix();
- // randomFilename ="";
- }
-
- public void setOriClusterSize(int oriClusterSize) {
- this.oriClusterSize = oriClusterSize;
- }
- public int getOriClusterSize () {
- return oriClusterSize;
- }
-
- public void setOriPoolOrfsName(ArrayList oriPoolOrfsName) {
- this.oriPoolOrfsName = oriPoolOrfsName;
- }
- public ArrayList getOriPoolOrfsName() {
- return oriPoolOrfsName;
- }
-
- /**
- *
- * @param clusterFilename : cluster Filename
- * @param GoMatrixFilename : GoMatrix Filename
- * @param backGroundFilename : backGround Filename
- */
- public RawGoID(String clusterFilename, String GoMatrixFilename, String backGroundFilename) {
- try {
- clusterGeneList = new ArrayList(200);
- ArrayList refClusterGeneList = new ArrayList (200);
- // Get the smallGeneList (a cluster )
- BufferedReader br = new BufferedReader(new FileReader(clusterFilename));
- // strRow is used to read line from file
- String strRow = "";
- while ((strRow = br.readLine()) != null) {
- clusterGeneList.add(strRow.trim().toLowerCase());
- }
- // System.out.println(clusterGeneList.size());
- setOriClusterSize(clusterGeneList.size());
- // System.out.println("original cluster size =" + clusterGeneList.size());
-
- // Get the mtrix (lookup table)
- lookupTable = new Matrix(GoMatrixFilename);
- // Get the bigGeneList (pool or back ground file)
- br = new BufferedReader(new FileReader(backGroundFilename));
- ArrayList poolOrfsName = new ArrayList(5000);
- while ((strRow = br.readLine()) != null) {
- poolOrfsName.add(strRow.trim().toLowerCase());
- }
- this.setOriPoolOrfsName(poolOrfsName);
- poolTable = new Matrix();
- for (int i = 0; i < poolOrfsName.size(); i++) {
- Object tempKey = poolOrfsName.get(i);
- if (lookupTable.getMatrix().containsKey(tempKey)) {
- poolTable.addValue(tempKey, lookupTable.getMatrix().get(tempKey));
- }
- }
- poolTable.setRowSize(poolTable.getMatrix().size());
- poolTable.setColSize(lookupTable.getColSize());
- br.close();
-
- // This loop is to take out any ORF from the cluster gene list if not exist in pool table
- // not necessary if all cluster ORFs are from pool table
- for (int i=0;i<refClusterGeneList.size();i++){
- Object tempKey = clusterGeneList.get(i);
- if (!poolTable.getMatrix().containsKey(tempKey)){
- clusterGeneList.remove(i);
- }
- }
- // System.out.println("length of real cluster gene List after filtering = " + clusterGeneList.size());
- // Check point
- // System.out.println(clusterGeneList);
- } catch (IOException e) {
- // Catch possible io errors from readLine()
- System.out.println("IOException error in 'class GetGoID, constructor'");
- }
- // Checkpoint
- // System.out.println("Column size of pooltable is:"+ poolTable.getColSize() +'\t'+ "Row size of pooltable is :"+ poolTable.getRowSize());
- // randomFilename = "randomOrfName.txt";
- }
- /**
- *
- * @param clusterFilename : cluster Filename
- * @param GoMatrixFilename : GoMatrix Filename
- * @param backGroundFilename : backGround Filename
- */
- public RawGoID(String[] clusterName, String GoMatrixFilename, String backGroundFilename) {
- try {
- clusterGeneList = new ArrayList(clusterName.length);
- ArrayList refClusterGeneList = new ArrayList (200);
- // Get the smallGeneList (a cluster )
- for(String name: clusterName){
- clusterGeneList.add(name.trim().toLowerCase());
- }
- // System.out.println(clusterGeneList.size());
- setOriClusterSize(clusterGeneList.size());
- // System.out.println("original cluster size =" + clusterGeneList.size());
- // Get the mtrix (lookup table)
- lookupTable = new Matrix(GoMatrixFilename);
- // Get the bigGeneList (pool or back ground file)
- BufferedReader br = new BufferedReader(new FileReader(backGroundFilename));
- ArrayList poolOrfsName = new ArrayList(5000);
- String strRow = "";
- while ((strRow = br.readLine()) != null) {
- poolOrfsName.add(strRow.trim().toLowerCase());
- }
- this.setOriPoolOrfsName(poolOrfsName);
- poolTable = new Matrix();
- for (int i = 0; i < poolOrfsName.size(); i++) {
- Object tempKey = poolOrfsName.get(i);
- if(lookupTable.getMatrix().containsKey(tempKey)){
- poolTable.addValue(tempKey, lookupTable.getMatrix().get(tempKey));
- }
- }
- poolTable.setRowSize(poolTable.getMatrix().size());
- poolTable.setColSize(lookupTable.getColSize());
- br.close();
- // This loop is to take out any ORF from the cluster gene list if not exist in pool table
- // not necessary if all cluster ORFs are from pool table
- for (int i=0;i<refClusterGeneList.size();i++){
- Object tempKey = clusterGeneList.get(i);
- if (!poolTable.getMatrix().containsKey(tempKey)){
- clusterGeneList.remove(i);
- }
- }
- // System.out.println("length of real cluster gene List after filtering = " + clusterGeneList.size());
- // Checkpoint
- // System.out.println(clusterGeneList);
- } catch (IOException e) {
- // Catch possible io errors from readLine()
- System.out.println("IOException error in 'class GetGoID, constructor'");
- }
- // Checkpoint
- // System.out.println("Column size of pooltable is:"+ poolTable.getColSize() +'\t'+ "Row size of pooltable is :"+ poolTable.getRowSize());
- // randomFilename = "randomOrfName.txt";
- }
- public double getRawGoID() {
- double result = 0.0;
- ArrayList fullMatrix = new ArrayList(this.getPoolTable().getRowSize());
- ArrayList subMatrix = new ArrayList(this.getClusterGeneList().size());
-
- // Fill the fullMatrix with pool table data
- Iterator it = this.getPoolTable().getMatrix().keySet().iterator();
- while (it.hasNext()) {
- Object key = it.next();
- fullMatrix.add(this.getPoolTable().getMatrix().get(key.toString().toLowerCase()));
- }
- // System.out.println("size of fullMatrix is:"+ fullMatrix.size());
- // Fill the subMatrix with lookup table data and cluster information
- for (Object element : this.getClusterGeneList()) {
- if (this.getLookupTable().getMatrix().containsKey(element)) {
- subMatrix.add(this.getLookupTable().getMatrix().get(element.toString().toLowerCase()));
- }
- }
- // System.out.println("size of subMatrix is:"+ subMatrix.size());
-
- // Transpose the 2 matrix
- ArrayList attrByOrfFullMatrix = this.transpose(fullMatrix);
- ArrayList attrByOrfSubMatrix = this.transpose(subMatrix);
-
- // System.out.println("size of transposed fullMatrix is:"+ attrByOrfFullMatrix.size());
- // System.out.println("size of transposed subMatrix is:"+ attrByOrfSubMatrix.size());
-
- // Calculate the raw GoID
- for (int i = 0; i < attrByOrfFullMatrix.size(); i++) {
- // Added by tdh, from the source code, we need not do this step
- int nonZeroCount = 0;
- String[] tempArray = (String[]) attrByOrfFullMatrix.get(i);
- for (int j = 0; j < tempArray.length; j++) {
- // System.out.println(Integer.parseInt(tempArray[j].trim()));
- if (tempArray[j].trim().compareToIgnoreCase("1")==0 ) {
- // System.out.println(Integer.parseInt(tempArray[j].trim()));
- // System.out.println(tempArray[j].trim().compareToIgnoreCase("1"));//Jingyu added for checking
- nonZeroCount++;
- break; // added by tdh, from the source code, we need not do this step//Jingyu notes: the break may help the code running faster.
- // System.out.println("nonzeroCount =" + nonZeroCount);//Jingyu add
- }
- }
- // System.out.println("nonzeroCount =" + nonZeroCount);//Jingyu add
- if (nonZeroCount >= 0) {
- result = result + Information.relativeEntropy(
- ((String[]) attrByOrfSubMatrix.get(i)),
- (String[]) (attrByOrfFullMatrix.get(i)));
- }
- // System.out.println(Information.relativeEntropy(
- // ((String[]) attrByOrfSubMatrix.get(i)), (String[]) (attrByOrfFullMatrix.get(i))));
- }
- // System.out.println("result =" + result);
- return result;
- }
- private ArrayList transpose(ArrayList data) {
- ArrayList result = new ArrayList(data.size());
- // Do transpose here
- int rowSize = data.size();
- int colSize = ((String[]) data.get(0)).length;
- String[][] matrix = new String[colSize][rowSize];
- for (int i = 0; i < rowSize; i++) {
- String[] temp = (String[]) data.get(i);
- for (int j = 0; j < colSize; j++) {
- // System.out.println("j is : " + j);
- matrix[j][i] = temp[j];
- }
- }
- // Convert to ArrayList
- for (int i = 0; i < matrix.length; i++) {
- result.add(matrix[i]);
- }
- return result;
- }
- public double getRandomRawGoID() {
- double result = 0.0;
- this.setClusterGeneList(this.getRandomCluster(this.getOriClusterSize()));
- result = this.getRawGoID();
- if (Double.isNaN(result)) {
- return getRandomRawGoID();
- } else {
- return result;
- }
- }
- private void toFile(HashMap data, String filename) {
- // Output to a file
- try {
- BufferedWriter writer = new BufferedWriter(new FileWriter(filename));
- for (Object key : data.keySet()) {
- writer.write(key.toString() + "\n");
- }
- writer.close();
- } catch (Exception e) {
- System.err.println(e.getStackTrace());
- }
- }
- private static void toFileString(String data, String filename) {
- // Output to a file
- try {
- BufferedWriter writer = new BufferedWriter(new FileWriter(filename,true));
- writer.write(data + "\n");
- writer.close();
- } catch (Exception e) {
- System.err.println(e.getStackTrace());
- }
- }
- private ArrayList getRandomCluster(int clusterSize) {
- ArrayList<String> result = new ArrayList(clusterSize);
-
- // Jingyu: The following segment of code, which is deactivated, is designed to get the random cluster list from a lookuptable-filtered pooltable
- // Jingyu: To do so may cause a bias in average of random raw GOid score by using a smaller pool list
- // get a random cluster with same size of the cluster file and then calculate the Goid
- // 1, get the random orf names to a ArrayList
- // HashMap hm = new HashMap(this.getClusterGeneList().size());
- // while (hm.keySet().size() < clusterSize) {
- // hm.put(this.getPoolTable().getOrfNames().get(randInt(this.getPoolTable().getOrfNames().size())), "0");
- // }
- // result.addAll(hm.keySet());
- // Get a random cluster with same size of the cluster file from the original ORF pool
- // Extra step added by Jingyu to remove the ORFs not existing in pooltable;
- ArrayList localOriPoolTable = new ArrayList();
- localOriPoolTable = this.getOriPoolOrfsName();
- // Checkpoint
- // System.out.println(localOriPoolTable.size());
- for (int i=0;i<clusterSize;i++){
- result.add(localOriPoolTable.get(randInt(localOriPoolTable.size())).toString().trim().toLowerCase());
- }
- return result;
- }
- /**
- *
- * @param max the max integer you want to generate
- *
- * @return : a random integar between 0 and max
- */
- private int randInt(int max) {
- Random r = new Random((int) (System.nanoTime()));
- int random = r.nextInt();
- random = Math.abs(random);
- random = random % max;
- // random += 1;
- return random;
- }
- // This method is not used for the final code.
- // private String [] getZeroStringArray(int length) {
- // String [] tmpStrArray = new String[length];
- // for (int j=0; j<tmpStrArray.length; j++) {
- // tmpStrArray[j] = "0";
- // }
- // return tmpStrArray;
- // }
- }
|