GetClusters.java 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532
  /**
   * This program takes an input file (in either ARFF or CSV format) and
   * outputs three files: one with the tree structure, one with the final table
   * containing all information, and one with the summary information.
   */
  7. import java.io.BufferedReader;
  8. import java.io.File;
  9. import java.io.FileNotFoundException;
  10. import java.io.FileOutputStream;
  11. import java.io.FileReader;
  12. import java.io.IOException;
  13. import java.io.OutputStreamWriter;
  14. import java.util.Vector;
  15. import weka.clusterers.ClusterEvaluation;
  16. import weka.clusterers.EM;
  17. import weka.core.Instances;
  18. public class GetClusters {
  19. public GetClusters() {
  20. }
  21. /**
  22. *
  23. * @param root the tree node we need to cluster
  24. * @param generation the depth of the tree
  25. * @param position the breadth of the tree
  26. * @param vecFinalTable contain the final table
  27. */
  28. public int clustering(TreeNode root, int generation,
  29. String position, Vector vecFinalTable,
  30. Vector vecSummary, String outputFilename, OutputStreamWriter xmlWriter,
  31. String lookupFile, String backgroundFile, String count, boolean fromFunction) {
  32. int result = 0;
  33. try {
  34. FileOutputStream stream;// provides file access
  35. OutputStreamWriter writer;// writes to the file
  36. stream = new FileOutputStream(new File(outputFilename), true);
  37. writer = new OutputStreamWriter(stream);
  38. // ***** 1 create a copy of original data *****
  39. Instances oriData = root.getData();
  40. Instances data = new Instances(oriData);
  41. // ***** 2 remove attribute: orf_name(string attribute) *****
  42. // data.deleteAttributeAt(0);
  43. data.deleteStringAttributes();
  44. // ***** 3 clustering *****
  45. EM clusterer = new EM(); // new instance of clusterer
  46. clusterer.buildClusterer(data); // build the clusterer
  47. // evaluate cluster
  48. ClusterEvaluation eval = new ClusterEvaluation();
  49. eval.setClusterer(clusterer); // the cluster to evaluate
  50. eval.evaluateClusterer(data); // data to evaluate the clusterer on
  51. //get the rawGoID and zScore for the
  52. //to be continued. AAA
  53. String[] clusterNames = getClusterNames(oriData);
  54. double[] goID = null;
  55. if(fromFunction){
  56. goID = this.getGoID(clusterNames, lookupFile, backgroundFile,count);
  57. }else{
  58. goID = this.getGoIDFromFunc(clusterNames, lookupFile, backgroundFile,count);
  59. }
  60. double logLikelihood = eval.getLogLikelihood();
  61. writer.write("logLikelihood is: " + logLikelihood + "\n");
  62. writer.write("GoID is: " + goID[0] + "\n");
  63. writer.write("zScore is: " + goID[1] + "\n\n");
  64. writer.flush();
  65. // ***** 4 get the sub clusters *****
  66. int numberOfSubCluster = eval.getNumClusters();
  67. if (numberOfSubCluster > 1) {// not an end node
  68. // create numberOfSubCluster instances array to store sub
  69. // clusters
  70. Instances[] subData = new Instances[numberOfSubCluster];
  71. TreeNode[] subNode = new TreeNode[numberOfSubCluster];
  72. for (int i = 0; i < numberOfSubCluster; i++) {
  73. subData[i] = new Instances(oriData);
  74. subData[i].delete();// keep only data head(attributes part)
  75. }
  76. // //System.out.println("\nlength is: " + data.numInstances());
  77. // //System.out.println("number of clusters: " +
  78. // numberOfSubCluster);
  79. // //System.out.println(eval.clusterResultsToString());
  80. double[] dArray = eval.getClusterAssignments();
  81. for (int i = 0; i < dArray.length; i++) {
  82. int clusterNumber = (int) dArray[i];
  83. // //System.out.println("\ngene " + i + " is in cluster: "
  84. // + clusterNumber + ",\tlog likelihood is:"
  85. // + eval.getLogLikelihood());
  86. // //System.out.println("***************");
  87. // assign each gene to according cluster
  88. for (int j = 0; j < subData.length; j++) {
  89. if (j == clusterNumber) {
  90. subData[j].add(oriData.instance(i));
  91. }
  92. }// end of inner j loop
  93. }// end of outter i loop
  94. // ***** 5 recursive call *****
  95. String uniName = "";
  96. // for (int i = 0; i <= generation; i++) {
  97. // uniName += "0";
  98. // }
  99. uniName += generation + "-" + position;
  100. generation++;
  101. for (int i = 0; i < numberOfSubCluster; i++) {
  102. String name = uniName + "-" + i;
  103. //System.out.println("\n******************************");
  104. //System.out.println("cluster name: " + name);
  105. writer.write("\n******************************\n");
  106. writer.write("cluster name: " + name + "\n");
  107. writer.flush();
  108. xmlWriter.write(" <branch>\n <attribute name=\"name\" value=\"" + name + "\"/>\n");
  109. xmlWriter.flush();
  110. subNode[i] = new TreeNode(name, eval.getLogLikelihood(),
  111. subData[i], root);
  112. result += clustering(subNode[i], generation,
  113. position + "." + i, vecFinalTable, vecSummary, outputFilename,
  114. xmlWriter,lookupFile,backgroundFile,count,fromFunction);
  115. xmlWriter.write(" </branch>\n");
  116. xmlWriter.flush();
  117. }// end of for loop
  118. } else { //for leaf node
  119. //System.out.println("leaf node");
  120. result = 1;
  121. int temp = 1;
  122. if (!vecSummary.isEmpty()) {
  123. String strT = (vecSummary.lastElement().toString()).split(",")[1];
  124. temp = Integer.parseInt(strT.trim()) + 1;
  125. }
  126. writer.write("leaf node\n");
  127. writer.flush();
  128. for (int i = 0; i < root.getData().numInstances(); i++) {
  129. String strTemp = eval.getLogLikelihood() + "," + root.getData().instance(i) + "," + getAncestor(root, false) + "," + temp;
  130. //System.out.println( strTemp);
  131. writer.write(strTemp + "\n");
  132. writer.flush();
  133. xmlWriter.write("<leaf>\n <attribute name=\"name\" value=\"" + root.getData().instance(i).stringValue(0) + "\"/>\n</leaf>\n");
  134. xmlWriter.flush();
  135. vecFinalTable.addElement(strTemp);
  136. }
  137. vecSummary.addElement(getAncestor(root, false).toString() + "," + temp + "," + root.getData().numInstances() + "," + logLikelihood);
  138. //System.out.println("******************************\n");
  139. writer.write("******************************\n");
  140. writer.flush();
  141. generation--;
  142. }//end of else
  143. writer.close();
  144. stream.close();
  145. } catch (Exception e) {
  146. // TODO Auto-generated catch block
  147. e.printStackTrace();
  148. }
  149. return result;
  150. }//end of method "clustering"
  151. /**
  152. * output the root cluster name to file
  153. * @param fileName output file name
  154. * @param rootName thr root cluster name
  155. */
  156. public void printRootName(String fileName, String rootName) {
  157. try {
  158. FileOutputStream stream;// provides file access
  159. OutputStreamWriter writer;// writes to the file
  160. stream = new FileOutputStream(new File(fileName), true);
  161. writer = new OutputStreamWriter(stream);
  162. writer.write("root cluster is:" + rootName + "\n");
  163. writer.flush();
  164. writer.close();
  165. stream.close();
  166. } catch (FileNotFoundException e) {
  167. // TODO Auto-generated catch block
  168. e.printStackTrace();
  169. } catch (IOException e) {
  170. // TODO Auto-generated catch block
  171. e.printStackTrace();
  172. }
  173. }
  174. /**
  175. * print out the instance part of the data into a CSV formated table.
  176. *
  177. * @param data: the printed data set
  178. */
  179. public String printTableHead(Instances data) {
  180. String strResult = "likelihood";
  181. for (int i = 0; i < data.numAttributes(); i++) {
  182. String strTemp = "";
  183. String[] strArr = data.attribute(i).toString().split("\\ ");
  184. for (int j = 1; j < strArr.length - 1; j++) {
  185. strTemp += strArr[j];
  186. }
  187. strResult += "," + strTemp;
  188. }
  189. return strResult + ",cluster origin,cluster ID";
  190. }//end of method "printTalbe"
  191. /**
  192. * print the vector
  193. * @param vec
  194. */
  195. public void printVector(Vector vec, String outputFilename) {
  196. //System.out.println("\n***************************");
  197. //System.out.println("*** final result ***");
  198. //System.out.println("***************************");
  199. try {
  200. FileOutputStream stream;// provides file access
  201. OutputStreamWriter writer;// writes to the file
  202. stream = new FileOutputStream(new File(outputFilename), false);
  203. writer = new OutputStreamWriter(stream);
  204. for (int i = 0; i < vec.size(); i++) {
  205. //System.out.println(vec.elementAt(i));
  206. writer.write(vec.elementAt(i).toString() + "\n");
  207. }
  208. writer.close();
  209. stream.close();
  210. } catch (FileNotFoundException e) {
  211. // TODO Auto-generated catch block
  212. e.printStackTrace();
  213. } catch (IOException e) {
  214. // TODO Auto-generated catch block
  215. e.printStackTrace();
  216. }
  217. //System.out.println("\n***************************");
  218. //System.out.println("*** end of final result ***");
  219. //System.out.println("***************************");
  220. }
  221. /**
  222. *
  223. * @param endNode an leaf node
  224. * @return a string contains all the ancestor's name of the node
  225. */
  226. public String getAncestor(TreeNode endNode, boolean fromLeafNode) {
  227. String strResult = endNode.getStrName();
  228. TreeNode tempNode = endNode;
  229. while (tempNode.getParent() != null) {
  230. tempNode = tempNode.getParent();
  231. strResult += "; " + tempNode.getStrName();
  232. }
  233. if (fromLeafNode) {
  234. return strResult;
  235. } else {
  236. String newResult = "";
  237. String[] history = strResult.split("\\;");
  238. for (int i = history.length; i > 0; i--) {
  239. newResult += history[i - 1] + "; ";
  240. }
  241. return newResult;
  242. }
  243. }
  244. /**
  245. * check the number of the arguments:
  246. * java GetCluster arg1 arg2 ...
  247. *
  248. * @param length the length of the arguments
  249. * in this program, length should be 1
  250. */
  251. public void checkParameters(int length) {
  252. if (length != 1) {
  253. System.out.println("Usage: java GetCluster inputFileName");
  254. System.exit(1);
  255. }
  256. }
  257. /**
  258. *
  259. * @param inputFileName the name of the input file name
  260. * @return an Instances of Weka Instances
  261. */
  262. public Instances input(String inputFileName) {
  263. String[] inputName = inputFileName.split("\\.");
  264. Instances oriData = null;
  265. try {
  266. if (inputName[inputName.length - 1].compareToIgnoreCase("csv") == 0) {
  267. // read from csv file
  268. readCSV(inputFileName);
  269. FileReader f = new FileReader(inputFileName + ".arff");
  270. BufferedReader b = new BufferedReader(f);
  271. oriData = new Instances(b);
  272. } else if (inputName[inputName.length - 1].compareToIgnoreCase("arff") == 0) {
  273. // read from arff data
  274. FileReader f = new FileReader(inputFileName);
  275. BufferedReader b = new BufferedReader(f);
  276. oriData = new Instances(b);
  277. } else {
  278. System.out.println("only .arff or .csv format allowed!");
  279. System.exit(1);
  280. }
  281. } catch (FileNotFoundException e) {
  282. // TODO Auto-generated catch block
  283. e.printStackTrace();
  284. } catch (IOException e) {
  285. // TODO Auto-generated catch block
  286. e.printStackTrace();
  287. }
  288. return oriData;
  289. }
  290. /**
  291. * read a csv file and convert to a arff file
  292. * @param inputName the name of the csv file
  293. */
  294. public void readCSV(String inputName) {
  295. try {
  296. FileReader fr = new FileReader(inputName);
  297. BufferedReader br = new BufferedReader(fr);
  298. FileOutputStream stream;// provides file access
  299. OutputStreamWriter writer;// writes to the file
  300. stream = new FileOutputStream(new File(inputName + ".arff"), false);
  301. writer = new OutputStreamWriter(stream);
  302. String strLine = br.readLine();
  303. String[] varNameArray = strLine.split("\\,");
  304. writer.write("@RELATION dataset" + "\n\n");
  305. for (int i = 0; i < varNameArray.length; i++) {
  306. if (i < 2) {
  307. writer.write("@ATTRIBUTE" + " " + "\"" + varNameArray[i] + "\"" + " " + "string" + "\n");
  308. } else {
  309. writer.write("@ATTRIBUTE" + " " + "\"" + varNameArray[i] + "\"" + " " + "numeric" + "\n");
  310. }
  311. }
  312. writer.write("\n@DATA\n");
  313. while ((strLine = br.readLine()) != null) {
  314. writer.write(strLine + "\n");
  315. }
  316. writer.close();
  317. stream.close();
  318. fr.close();
  319. br.close();
  320. } catch (FileNotFoundException e) {
  321. // TODO Auto-generated catch block
  322. e.printStackTrace();
  323. } catch (IOException e) {
  324. // TODO Auto-generated catch block
  325. e.printStackTrace();
  326. }
  327. }
  328. /**
  329. *
  330. * @param data
  331. * @return an array contains the first element
  332. * of each instance of input data
  333. */
  334. private String[] getClusterNames(Instances data) {
  335. String[] result = new String[data.numInstances()];
  336. for (int i = 0; i < result.length; i++) {
  337. String[] strArray = data.instance(i).toString().split("\\,");
  338. result[i] = strArray[0];
  339. }
  340. return result;
  341. }
  342. private double[] getGoID(String[] clusterNames, String lookupFile, String backgroundFile, String count) {
  343. //********************************
  344. // part 2, calculate RawGoID
  345. //********************************
  346. double[] result = new double[2];
  347. //initialize local variables:
  348. RawGoID myRawGoID = new RawGoID(clusterNames, lookupFile, backgroundFile);
  349. double clusterGoid = myRawGoID.getRawGoID();
  350. double randomAve = 0.0;
  351. double randomStd = 0.0;
  352. double zScore = 0.0;
  353. // System.out.println("real cluster raw GOid =" + clusterGoid);
  354. // get 'repeat time' random rawGoIDs
  355. double[] randomGoid = new double[Integer.parseInt(count)];
  356. for (int i = 0; i < Integer.parseInt(count); i++) {
  357. randomGoid[i] = myRawGoID.getRandomRawGoID();
  358. // System.out.println("now is in loop :" + (i + 1));
  359. // System.out.println("randomGOid = " + randomGoid[i]);
  360. }
  361. //calculate
  362. randomAve = Stats.getMean(randomGoid);
  363. randomStd = Stats.getStdDev(randomGoid);
  364. zScore = Stats.getZscore(randomAve, randomStd, clusterGoid);
  365. result[0] = clusterGoid;
  366. result[1] = zScore;
  367. return result;
  368. }
  369. private double[] getGoIDFromFunc(String[] clusterNames, String lookupFile, String backgroundFile, String count) {
  370. //********************************
  371. // part 2, calculate RawGoID
  372. //********************************
  373. double[] result = new double[2];
  374. //initialize local variables:
  375. RawGoID myRawGoID = new RawGoID(clusterNames, lookupFile, backgroundFile);
  376. double clusterGoid = myRawGoID.getRawGoID();
  377. double randomAve = 0.0;
  378. double randomStd = 0.0;
  379. double zScore = 0.0;
  380. // System.out.println("real cluster raw GOid =" + clusterGoid);
  381. // get 'repeat time' random rawGoIDs
  382. double[] randomGoid = new double[Integer.parseInt(count)];
  383. for (int i = 0; i < Integer.parseInt(count); i++) {
  384. randomGoid[i] = myRawGoID.getRandomRawGoID();
  385. // System.out.println("now is in loop :" + (i + 1));
  386. // System.out.println("randomGOid = " + randomGoid[i]);
  387. }
  388. //calculate
  389. randomAve = Stats.getMeanFromFunc(myRawGoID.getOriClusterSize());
  390. randomStd = Stats.getStdDevFromFunc(myRawGoID.getOriClusterSize());
  391. zScore = Stats.getZscore(randomAve, randomStd, clusterGoid);
  392. result[0] = clusterGoid;
  393. result[1] = zScore;
  394. return result;
  395. }
  396. }//end of class
  397. final class TreeNode {
  398. private String strName;
  399. private double dLikelihood;
  400. private Instances data;
  401. private TreeNode parent;
  402. // TreeNode child;
  403. /**
  404. * @param strName name of node
  405. * @param likelihood likelihood of the data
  406. * @param data data set
  407. * @param parent point to its parent node
  408. * @param child point to its child node
  409. */
  410. public TreeNode(String strName, double likelihood, Instances data, TreeNode parent) {
  411. this.strName = strName;
  412. dLikelihood = likelihood;
  413. this.data = data;
  414. this.parent = parent;
  415. }
  416. /**
  417. * @return the data
  418. */
  419. public Instances getData() {
  420. return data;
  421. }
  422. /**
  423. * @param data the data to set
  424. */
  425. public void setData(Instances data) {
  426. this.data = data;
  427. }
  428. /**
  429. * @return the dLikelihood
  430. */
  431. public double getDLikelihood() {
  432. return dLikelihood;
  433. }
  434. /**
  435. * @param likelihood the dLikelihood to set
  436. */
  437. public void setDLikelihood(double likelihood) {
  438. dLikelihood = likelihood;
  439. }
  440. /**
  441. * @return the parent
  442. */
  443. public TreeNode getParent() {
  444. return parent;
  445. }
  446. /**
  447. * @param parent the parent to set
  448. */
  449. public void setParent(TreeNode parent) {
  450. this.parent = parent;
  451. }
  452. /**
  453. * @return the strName
  454. */
  455. public String getStrName() {
  456. return strName;
  457. }
  458. /**
  459. * @param strName the strName to set
  460. */
  461. public void setStrName(String strName) {
  462. this.strName = strName;
  463. }
  464. }