RawGoID.java 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375
  /*
   * Input: three files.
   *   1. The cluster file.
   *   2. The GO matrix file (the lookup table).
   *   3. The background file (the gene pool).
   */
  7. import java.io.BufferedReader;
  8. import java.io.BufferedWriter;
  9. import java.io.FileReader;
  10. import java.io.FileWriter;
  11. import java.io.IOException;
  12. import java.util.ArrayList;
  13. import java.util.HashMap;
  14. import java.util.Iterator;
  15. import java.util.Random;
  16. /**
  17. *
  18. * @author DTian
  19. */
  20. public class RawGoID {
  21. private ArrayList clusterGeneList; // for the input cluster file
  22. private Matrix poolTable; //for the filtered gene pool list
  23. private Matrix lookupTable; // for the lookup attribute table
  24. private int oriClusterSize; //for the original cluster size
  25. private ArrayList oriPoolOrfsName;//for the complete list of pool table
  26. // private String randomFilename;
  27. public Matrix getLookupTable() {
  28. return lookupTable;
  29. }
  30. public void setLookupTable(Matrix lookupTable) {
  31. this.lookupTable = lookupTable;
  32. }
  33. public Matrix getPoolTable() {
  34. return poolTable;
  35. }
  36. public void setPoolTable(Matrix poolTable) {
  37. this.poolTable = poolTable;
  38. }
  39. public ArrayList getClusterGeneList() {
  40. return clusterGeneList;
  41. }
  42. public void setClusterGeneList(ArrayList clusterGeneList) {
  43. this.clusterGeneList = clusterGeneList;
  44. }
  45. public RawGoID() {
  46. clusterGeneList = new ArrayList();
  47. poolTable = new Matrix();
  48. lookupTable = new Matrix();
  49. // randomFilename ="";
  50. }
  51. public void setOriClusterSize(int oriClusterSize) {
  52. this.oriClusterSize = oriClusterSize;
  53. }
  54. public int getOriClusterSize () {
  55. return oriClusterSize;
  56. }
  57. public void setOriPoolOrfsName(ArrayList oriPoolOrfsName) {
  58. this.oriPoolOrfsName = oriPoolOrfsName;
  59. }
  60. public ArrayList getOriPoolOrfsName() {
  61. return oriPoolOrfsName;
  62. }
  63. /**
  64. *
  65. * @param clusterFilename : cluster Filename
  66. * @param GoMatrixFilename : GoMatrix Filename
  67. * @param backGroundFilename : backGround Filename
  68. */
  69. public RawGoID(String clusterFilename, String GoMatrixFilename, String backGroundFilename) {
  70. try {
  71. clusterGeneList = new ArrayList(200);
  72. ArrayList refClusterGeneList = new ArrayList (200);
  73. // Get the smallGeneList (a cluster )
  74. BufferedReader br = new BufferedReader(new FileReader(clusterFilename));
  75. // strRow is used to read line from file
  76. String strRow = "";
  77. while ((strRow = br.readLine()) != null) {
  78. clusterGeneList.add(strRow.trim().toLowerCase());
  79. }
  80. // System.out.println(clusterGeneList.size());
  81. setOriClusterSize(clusterGeneList.size());
  82. // System.out.println("original cluster size =" + clusterGeneList.size());
  83. // Get the mtrix (lookup table)
  84. lookupTable = new Matrix(GoMatrixFilename);
  85. // Get the bigGeneList (pool or back ground file)
  86. br = new BufferedReader(new FileReader(backGroundFilename));
  87. ArrayList poolOrfsName = new ArrayList(5000);
  88. while ((strRow = br.readLine()) != null) {
  89. poolOrfsName.add(strRow.trim().toLowerCase());
  90. }
  91. this.setOriPoolOrfsName(poolOrfsName);
  92. poolTable = new Matrix();
  93. for (int i = 0; i < poolOrfsName.size(); i++) {
  94. Object tempKey = poolOrfsName.get(i);
  95. if (lookupTable.getMatrix().containsKey(tempKey)) {
  96. poolTable.addValue(tempKey, lookupTable.getMatrix().get(tempKey));
  97. }
  98. }
  99. poolTable.setRowSize(poolTable.getMatrix().size());
  100. poolTable.setColSize(lookupTable.getColSize());
  101. br.close();
  102. // This loop is to take out any ORF from the cluster gene list if not exist in pool table
  103. // not necessary if all cluster ORFs are from pool table
  104. for (int i=0;i<refClusterGeneList.size();i++){
  105. Object tempKey = clusterGeneList.get(i);
  106. if (!poolTable.getMatrix().containsKey(tempKey)){
  107. clusterGeneList.remove(i);
  108. }
  109. }
  110. // System.out.println("length of real cluster gene List after filtering = " + clusterGeneList.size());
  111. // Check point
  112. // System.out.println(clusterGeneList);
  113. } catch (IOException e) {
  114. // Catch possible io errors from readLine()
  115. System.out.println("IOException error in 'class GetGoID, constructor'");
  116. }
  117. // Checkpoint
  118. // System.out.println("Column size of pooltable is:"+ poolTable.getColSize() +'\t'+ "Row size of pooltable is :"+ poolTable.getRowSize());
  119. // randomFilename = "randomOrfName.txt";
  120. }
  121. /**
  122. *
  123. * @param clusterFilename : cluster Filename
  124. * @param GoMatrixFilename : GoMatrix Filename
  125. * @param backGroundFilename : backGround Filename
  126. */
  127. public RawGoID(String[] clusterName, String GoMatrixFilename, String backGroundFilename) {
  128. try {
  129. clusterGeneList = new ArrayList(clusterName.length);
  130. ArrayList refClusterGeneList = new ArrayList (200);
  131. // Get the smallGeneList (a cluster )
  132. for(String name: clusterName){
  133. clusterGeneList.add(name.trim().toLowerCase());
  134. }
  135. // System.out.println(clusterGeneList.size());
  136. setOriClusterSize(clusterGeneList.size());
  137. // System.out.println("original cluster size =" + clusterGeneList.size());
  138. // Get the mtrix (lookup table)
  139. lookupTable = new Matrix(GoMatrixFilename);
  140. // Get the bigGeneList (pool or back ground file)
  141. BufferedReader br = new BufferedReader(new FileReader(backGroundFilename));
  142. ArrayList poolOrfsName = new ArrayList(5000);
  143. String strRow = "";
  144. while ((strRow = br.readLine()) != null) {
  145. poolOrfsName.add(strRow.trim().toLowerCase());
  146. }
  147. this.setOriPoolOrfsName(poolOrfsName);
  148. poolTable = new Matrix();
  149. for (int i = 0; i < poolOrfsName.size(); i++) {
  150. Object tempKey = poolOrfsName.get(i);
  151. if(lookupTable.getMatrix().containsKey(tempKey)){
  152. poolTable.addValue(tempKey, lookupTable.getMatrix().get(tempKey));
  153. }
  154. }
  155. poolTable.setRowSize(poolTable.getMatrix().size());
  156. poolTable.setColSize(lookupTable.getColSize());
  157. br.close();
  158. // This loop is to take out any ORF from the cluster gene list if not exist in pool table
  159. // not necessary if all cluster ORFs are from pool table
  160. for (int i=0;i<refClusterGeneList.size();i++){
  161. Object tempKey = clusterGeneList.get(i);
  162. if (!poolTable.getMatrix().containsKey(tempKey)){
  163. clusterGeneList.remove(i);
  164. }
  165. }
  166. // System.out.println("length of real cluster gene List after filtering = " + clusterGeneList.size());
  167. // Checkpoint
  168. // System.out.println(clusterGeneList);
  169. } catch (IOException e) {
  170. // Catch possible io errors from readLine()
  171. System.out.println("IOException error in 'class GetGoID, constructor'");
  172. }
  173. // Checkpoint
  174. // System.out.println("Column size of pooltable is:"+ poolTable.getColSize() +'\t'+ "Row size of pooltable is :"+ poolTable.getRowSize());
  175. // randomFilename = "randomOrfName.txt";
  176. }
  177. public double getRawGoID() {
  178. double result = 0.0;
  179. ArrayList fullMatrix = new ArrayList(this.getPoolTable().getRowSize());
  180. ArrayList subMatrix = new ArrayList(this.getClusterGeneList().size());
  181. // Fill the fullMatrix with pool table data
  182. Iterator it = this.getPoolTable().getMatrix().keySet().iterator();
  183. while (it.hasNext()) {
  184. Object key = it.next();
  185. fullMatrix.add(this.getPoolTable().getMatrix().get(key.toString().toLowerCase()));
  186. }
  187. // System.out.println("size of fullMatrix is:"+ fullMatrix.size());
  188. // Fill the subMatrix with lookup table data and cluster information
  189. for (Object element : this.getClusterGeneList()) {
  190. if (this.getLookupTable().getMatrix().containsKey(element)) {
  191. subMatrix.add(this.getLookupTable().getMatrix().get(element.toString().toLowerCase()));
  192. }
  193. }
  194. // System.out.println("size of subMatrix is:"+ subMatrix.size());
  195. // Transpose the 2 matrix
  196. ArrayList attrByOrfFullMatrix = this.transpose(fullMatrix);
  197. ArrayList attrByOrfSubMatrix = this.transpose(subMatrix);
  198. // System.out.println("size of transposed fullMatrix is:"+ attrByOrfFullMatrix.size());
  199. // System.out.println("size of transposed subMatrix is:"+ attrByOrfSubMatrix.size());
  200. // Calculate the raw GoID
  201. for (int i = 0; i < attrByOrfFullMatrix.size(); i++) {
  202. // Added by tdh, from the source code, we need not do this step
  203. int nonZeroCount = 0;
  204. String[] tempArray = (String[]) attrByOrfFullMatrix.get(i);
  205. for (int j = 0; j < tempArray.length; j++) {
  206. // System.out.println(Integer.parseInt(tempArray[j].trim()));
  207. if (tempArray[j].trim().compareToIgnoreCase("1")==0 ) {
  208. // System.out.println(Integer.parseInt(tempArray[j].trim()));
  209. // System.out.println(tempArray[j].trim().compareToIgnoreCase("1"));//Jingyu added for checking
  210. nonZeroCount++;
  211. break; // added by tdh, from the source code, we need not do this step//Jingyu notes: the break may help the code running faster.
  212. // System.out.println("nonzeroCount =" + nonZeroCount);//Jingyu add
  213. }
  214. }
  215. // System.out.println("nonzeroCount =" + nonZeroCount);//Jingyu add
  216. if (nonZeroCount >= 0) {
  217. result = result + Information.relativeEntropy(
  218. ((String[]) attrByOrfSubMatrix.get(i)),
  219. (String[]) (attrByOrfFullMatrix.get(i)));
  220. }
  221. // System.out.println(Information.relativeEntropy(
  222. // ((String[]) attrByOrfSubMatrix.get(i)), (String[]) (attrByOrfFullMatrix.get(i))));
  223. }
  224. // System.out.println("result =" + result);
  225. return result;
  226. }
  227. private ArrayList transpose(ArrayList data) {
  228. ArrayList result = new ArrayList(data.size());
  229. // Do transpose here
  230. int rowSize = data.size();
  231. int colSize = ((String[]) data.get(0)).length;
  232. String[][] matrix = new String[colSize][rowSize];
  233. for (int i = 0; i < rowSize; i++) {
  234. String[] temp = (String[]) data.get(i);
  235. for (int j = 0; j < colSize; j++) {
  236. // System.out.println("j is : " + j);
  237. matrix[j][i] = temp[j];
  238. }
  239. }
  240. // Convert to ArrayList
  241. for (int i = 0; i < matrix.length; i++) {
  242. result.add(matrix[i]);
  243. }
  244. return result;
  245. }
  246. public double getRandomRawGoID() {
  247. double result = 0.0;
  248. this.setClusterGeneList(this.getRandomCluster(this.getOriClusterSize()));
  249. result = this.getRawGoID();
  250. if (Double.isNaN(result)) {
  251. return getRandomRawGoID();
  252. } else {
  253. return result;
  254. }
  255. }
  256. private void toFile(HashMap data, String filename) {
  257. // Output to a file
  258. try {
  259. BufferedWriter writer = new BufferedWriter(new FileWriter(filename));
  260. for (Object key : data.keySet()) {
  261. writer.write(key.toString() + "\n");
  262. }
  263. writer.close();
  264. } catch (Exception e) {
  265. System.err.println(e.getStackTrace());
  266. }
  267. }
  268. private static void toFileString(String data, String filename) {
  269. // Output to a file
  270. try {
  271. BufferedWriter writer = new BufferedWriter(new FileWriter(filename,true));
  272. writer.write(data + "\n");
  273. writer.close();
  274. } catch (Exception e) {
  275. System.err.println(e.getStackTrace());
  276. }
  277. }
  278. private ArrayList getRandomCluster(int clusterSize) {
  279. ArrayList<String> result = new ArrayList(clusterSize);
  280. // Jingyu: The following segment of code, which is deactivated, is designed to get the random cluster list from a lookuptable-filtered pooltable
  281. // Jingyu: To do so may cause a bias in average of random raw GOid score by using a smaller pool list
  282. // get a random cluster with same size of the cluster file and then calculate the Goid
  283. // 1, get the random orf names to a ArrayList
  284. // HashMap hm = new HashMap(this.getClusterGeneList().size());
  285. // while (hm.keySet().size() < clusterSize) {
  286. // hm.put(this.getPoolTable().getOrfNames().get(randInt(this.getPoolTable().getOrfNames().size())), "0");
  287. // }
  288. // result.addAll(hm.keySet());
  289. // Get a random cluster with same size of the cluster file from the original ORF pool
  290. // Extra step added by Jingyu to remove the ORFs not existing in pooltable;
  291. ArrayList localOriPoolTable = new ArrayList();
  292. localOriPoolTable = this.getOriPoolOrfsName();
  293. // Checkpoint
  294. // System.out.println(localOriPoolTable.size());
  295. for (int i=0;i<clusterSize;i++){
  296. result.add(localOriPoolTable.get(randInt(localOriPoolTable.size())).toString().trim().toLowerCase());
  297. }
  298. return result;
  299. }
  300. /**
  301. *
  302. * @param max the max integer you want to generate
  303. *
  304. * @return : a random integar between 0 and max
  305. */
  306. private int randInt(int max) {
  307. Random r = new Random((int) (System.nanoTime()));
  308. int random = r.nextInt();
  309. random = Math.abs(random);
  310. random = random % max;
  311. // random += 1;
  312. return random;
  313. }
  314. // This method is not used for the final code.
  315. // private String [] getZeroStringArray(int length) {
  316. // String [] tmpStrArray = new String[length];
  317. // for (int j=0; j<tmpStrArray.length; j++) {
  318. // tmpStrArray[j] = "0";
  319. // }
  320. // return tmpStrArray;
  321. // }
  322. }