First attempt at script-run-workflow

This commit is contained in:
2024-07-21 23:50:24 -04:00
parent 628356652d
commit 06dd700680
290 changed files with 5524411 additions and 0 deletions

Binary file not shown.

View File

@@ -0,0 +1 @@
,jwrodger,hartmanlab.genetics.uab.edu,28.07.2022 09:10,file:///home/jwrodger/.config/libreoffice/4;

View File

@@ -0,0 +1,71 @@
#!/usr/bin/env python
# This code is to concatenate the batch GO Term Finder results (.tsv) generated from batch GTF perl code(Chris Johnson, U of Tulsa) into a list table
import sys, os, string, glob
# return the file list
def list_all_files(Path):
list_all_files = []
list_all_files = glob.glob(Path +'/*.txt.tsv')
return list_all_files
# Main function
try:
data_file_Path = sys.argv[1]
output_file_Path = sys.argv[2]
except:
print ('Usage: python Concatenate_GTF_results.py /datasetPath /outputFilePath_and_Name')
print ('Data file not found, error in given directory')
sys.exit(1)
# Open the output file
try:
output = open(output_file_Path, 'w')
except OSError:
print ('output file error')
# get all the GTF result files in given directory
File_list = []
File_list = list_all_files(data_file_Path)
File_list.sort()
i = 0
for file in File_list:
#parse the file names given in absolute path
file_name = file.strip().split('/')[-1]
file_name = file_name.rstrip('.txt.tsv')
# function to read tsv files from a given directory
#open the file
data = open(file,'r')
#reading the label line
labelLine = data.readline()
label = labelLine.strip().split('\t')
#write the label
#updates2010July26: update following label writing code
if i == 0:
# output.write('cluster origin')
for element in label:
output.write(element)
output.write('\t')
i = i + 1
#updates2010July26 End
#switch to the next line
output.write('\n')
#read the GO terms
GOTermLines = data.readlines()
for GOTerm in GOTermLines:
GOTerm = GOTerm.strip().strip('\t')
if GOTerm != '':
#updates2010July26: remove the code to write the first column 'REMc cluster ID'
#output.write(file_name)
#output.write('\t')
##updates2010July26 update end
output.write(GOTerm + '\n')
#output.write('\n')
output.close()

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,390 @@
#!/usr/bin/perl
# $Id: analyze.pl,v 1.9 2008/05/14 20:45:37 sherlock Exp $
# Date : 16th October 2003
# Author : Gavin Sherlock
# License information (the MIT license)
# Copyright (c) 2003 Gavin Sherlock; Stanford University
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
use strict;
use warnings;
use diagnostics;
use Data::Dumper;
use Getopt::Long;
use IO::File;
use GO::TermFinder;
use GO::AnnotationProvider::AnnotationParser;
use GO::OntologyProvider::OboParser;
use GO::TermFinderReport::Text;
use GO::Utils::File qw (GenesFromFile);
use GO::Utils::General qw (CategorizeGenes);
$|=1;
###################################################################################
sub Usage{
###################################################################################
my $message = shift;
if (defined $message){
print $message, "\n";
}
print <<USAGE;
This program takes a list of files, each of which contain a list of
genes, with one gene per line. It will findTerms for the lists of
genes in each of the GO aspects, outputting the results to a file
named for the original file, but with a .terms extension. It will only
output terms with a corrected P-value of <= 0.05.
It will use the first supplied argument as the annotation file, the
second argument as the expected number of genes within the organism,
the third argument is the name of the obo file, and all subsequent
files as ones containing lists of genes.
Usage:
analyze.pl <annotation_file> <numGenes> <obofile> <file1> <file2> <file3> ... <fileN>
e.g.
analyze.pl -a ../t/gene_association.sgd -n 7200 -o ../t/gene_ontology_edit.obo genes.txt genes2.txt
USAGE
exit;
}
# we need at least 3 arguments, an annotation file, the number of
# genes in the genome, and a file of input genes to test
&Usage if (@ARGV < 3);
# now get our annotation file and number of genes
my $annotationFile = '';
my $totalNum = '';
my $oboFile = '';
my $background = '';
my $aspect = '';
GetOptions( "annotations=s" => \$annotationFile,
"obofile=s" => \$oboFile,
"background=s" => \$background,
"numGenes=i" => \$totalNum,
"aspect=s" => \$aspect
);
if ($oboFile !~ /\.obo$/){
# require the obo file to have a .obo extension
&Usage("Your obo file does not have a .obo extension.");
}
if ($annotationFile !~ /\.sgd$/){
&Usage("Perhaps we are missing an annotation file.");
}
my @population = ();
if ($background) {
@population = GenesFromFile($background)
}
# now set up the objects we need
my $process = GO::OntologyProvider::OboParser->new(ontologyFile => $oboFile,
aspect => 'P');
my $component = GO::OntologyProvider::OboParser->new(ontologyFile => $oboFile,
aspect => 'C');
my $function = GO::OntologyProvider::OboParser->new(ontologyFile => $oboFile,
aspect => 'F');
my $annotation = GO::AnnotationProvider::AnnotationParser->new(annotationFile=>$annotationFile);
my @termFinders = ();
if ($background) {
if ($aspect =~ /^P$|^$/) {
push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
ontologyProvider => $process,
population => \@population,
aspect => 'P');
}
if ($aspect =~ /^C$|^$/) {
push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
ontologyProvider => $component,
population => \@population,
aspect => 'C');
}
if ($aspect =~ /^F$|^$/) {
push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
ontologyProvider => $function,
population => \@population,
aspect => 'F');
}
} else {
if ($aspect =~ /^P$|^$/) {
push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
ontologyProvider => $process,
totalNumGenes => $totalNum,
aspect => 'P');
}
if ($aspect =~ /^C$|^$/) {
push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
ontologyProvider => $component,
totalNumGenes => $totalNum,
aspect => 'C');
}
if ($aspect =~ /^F$|^$/) {
push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
ontologyProvider => $function,
totalNumGenes => $totalNum,
aspect => 'F');
}
}
my $report = GO::TermFinderReport::Text->new();
my $cutoff = 0.1;
# now go through each file
foreach my $file (@ARGV){
print "Analyzing $file\n";
my @genes = GenesFromFile($file);
my (@list, @notFound, @ambiguous);
CategorizeGenes(annotation => $annotation,
genes => \@genes,
ambiguous => \@ambiguous,
unambiguous => \@list,
notFound => \@notFound);
my $outfile = $file.".terms";
my $fh = IO::File->new($outfile, q{>} )|| die "Cannot make $outfile : $!";
print "Results being put in $outfile\n";
if (@list){
print $fh "The following gene(s) will be considered:\n\n";
foreach my $gene (@list){
print $fh $gene, "\t", $annotation->standardNameByName($gene), "\n";
}
print $fh "\n";
}else{
print $fh "None of the gene names were recognized\n";
print $fh "They were:\n\n";
print $fh join("\n", @notFound), "\n";
$fh->close;
next;
}
if (@ambiguous){
# note, some of these ambiguous names would be perfectly fine
# if put into GO::TermFinder if they are also standard names.
# Currently the behavior of analyze.pl differs from the
# default behavior of GO::TermFinder
print $fh "The following gene(s) are ambiguously named, and so will not be used:\n";
print $fh join("\n", @ambiguous), "\n\n";
}
if (@notFound){
print $fh "The following gene(s) were not recognized, and will not be considered:\n\n";
print $fh join("\n", @notFound), "\n\n";
}
foreach my $termFinder (@termFinders){
# it's possible that the supplied number of genes on the
# command line was less than indicated by the annotation
# provider, and thus the TermFinder may have used a larger
# number than was entered on the command line.
my $totalNumGenesUsedInBackground = $termFinder->totalNumGenes;
print $fh "Finding terms for ", $termFinder->aspect, "\n\n";
my @pvalues = $termFinder->findTerms(genes => \@list, calculateFDR => 1);
if($#pvalues == 0) {
print "WARNIING: NO p-value structures returned by findTerms(";
print join ",", @list;
print ")\n";
print $fh "\n\n";
$fh->close;
exit();
}
my $numHypotheses = $report->print(pvalues => \@pvalues,
numGenes => scalar(@list),
totalNum => $totalNumGenesUsedInBackground,
cutoff => $cutoff,
fh => $fh);
my $numProcesses = $#pvalues + 1;
print "Number of GO processes found: $numProcesses\n";
print "Number of hypotheses passed cutoff: $numHypotheses\n";
# if they had no significant P-values
if ($numHypotheses == 0){
print $fh "No terms were found for this aspect with a corrected P-value <= $cutoff.\n";
}
print $fh "\n\n";
}
$fh->close;
}
=pod
=head1 NAME
analyze.pl - batch processor to find terms for lists of genes in various files
=head1 SYNOPSIS
This program takes a list of files, each of which contain a list of
genes, with one gene per line. It will findTerms for the lists of
genes in each of the GO aspects, outputting the results to a file
named for the original file, but with a .terms extension. It will
only output terms with a corrected P-value of <= 0.05.
It will use the first supplied argument as the annotation file, the
second argument as the expected number of genes within the organism,
the third argument is the name of the obo file, and all subsequent
files as ones containing lists of genes.
Usage:
analyze.pl <annotation_file> <numGenes> <obofile> <file1> <file2> <file3> ... <fileN>
e.g.
analyze.pl ../t/gene_association.sgd 7200 ../t/gene_ontology_edit.obo genes.txt genes2.txt
An example output file might look like this:
The following gene(s) will be considered:
YDL235C YPD1
YDL224C WHI4
YDL225W SHS1
YDL226C GCS1
YDL227C HO
YDL228C YDL228C
YDL229W SSB1
YDL230W PTP1
YDL231C BRE4
YDL232W OST4
YDL233W YDL233W
YDL234C GYP7
Finding terms for P
Finding terms for C
Finding terms for F
-- 1 of 15--
GOID GO:0005096
TERM GTPase activator activity
CORRECTED P-VALUE 0.0113038452336839
UNCORRECTED P-VALUE 0.00113038452336839
NUM_ANNOTATIONS 2 of 12 in the list, vs 31 of 7272 in the genome
The genes annotated to this node are:
YDL234C, YDL226C
-- 2 of 15--
GOID GO:0008047
TERM enzyme activator activity
CORRECTED P-VALUE 0.0316194107645226
UNCORRECTED P-VALUE 0.00316194107645226
NUM_ANNOTATIONS 2 of 12 in the list, vs 52 of 7272 in the genome
The genes annotated to this node are:
YDL234C, YDL226C
-- 3 of 15--
GOID GO:0005083
TERM small GTPase regulatory/interacting protein activity
CORRECTED P-VALUE 0.0340606972468798
UNCORRECTED P-VALUE 0.00340606972468798
NUM_ANNOTATIONS 2 of 12 in the list, vs 54 of 7272 in the genome
The genes annotated to this node are:
YDL234C, YDL226C
-- 4 of 15--
GOID GO:0030695
TERM GTPase regulator activity
CORRECTED P-VALUE 0.0475469908576535
UNCORRECTED P-VALUE 0.00475469908576535
NUM_ANNOTATIONS 2 of 12 in the list, vs 64 of 7272 in the genome
The genes annotated to this node are:
YDL234C, YDL226C
=head1 AUTHORS
Gavin Sherlock, sherlock@genome.stanford.edu
=cut

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,55 @@
#!/usr/bin/perl
use strict;
use warnings;
use diagnostics;
use File::Map qw(map_file);
my $infile = shift;
my $input;
map_file $input, $infile;
{
local $_ = $input;
(my $f = $infile) =~ s/(.*\/)?(.*)(\.[^\.]*){2}/$2/;
my %orfgene = (/(Y\w+)\s+(\w+)\n/g);
my @indices = (/\Q-- \E(\d+) of \d+\Q --\E/g);
my @ids = (/GOID\s+GO:(\d+)/g);
my @terms = (/TERM\s+(.*?)\n/g);
my @pvalues = (/\nCORRECTED P-VALUE\s+(\d.*?)\n/g);
my @clusterf = (/NUM_ANNOTATIONS\s+(\d+ of \d+)/g);
my @bgfreq = (/, vs (\d+ of \d+) in the genome/g);
my @orfs = (/The genes annotated to this node are:\n(.*?)\n/g);
s/, /:/g for @orfs;
my @genes;
for my $orf (@orfs) {
my @otmp = split /:/, $orf;
my @gtmp = map { $orfgene{$_} } @otmp;
push @genes, (join ':', @gtmp);
}
&header();
for my $i (0 .. (@ids - 1)) {
&report($f, $ids[$i], $terms[$i], $pvalues[$i], $clusterf[$i], $bgfreq[$i], $orfs[$i], $genes[$i]);
}
}
sub header {
print "REMc ID\tGO_term ID\tGO-term\tCluster frequency\tBackground frequency\tP-value\tORFs\tGenes\n";
}
sub report {
my ($f, $id, $term, $p, $cfreq, $bgfreq, $orfs, $genes) = @_;
$cfreq =~ /(\d+) of (\d+)/;
$cfreq = sprintf "%d out of %d genes, %.1f%%", $1, $2, (100*$1/$2);
$bgfreq =~ /(\d+) of (\d+)/;
$bgfreq = sprintf "%d out of %d genes, %.1f%%", $1, $2, (100*$1/$2);
print "$f\t$id\t$term\t$cfreq\t$bgfreq\t$p\t$orfs\t$genes\n";
}