[程式] 如何用perl找出intergenic region
小的是perl新手
以下是參考用perl找到CDS的程式碼
將start_hash的location改為end
end_hash的location改為start
想請問如何修改才能得到所需的intergenic region
謝謝~
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
use File::Basename;
use Bio::SeqIO;
my $in_file;
my $out_dir;
my $id_tag;
my $print_aa;
my $print_nt;
my $print_gg;
my $print_loc;
my $verbose;
my $debug;
GetOptions(
"in_file=s" => \$in_file,
"out_dir=s" => \$out_dir,
"id_tag=s" => \$id_tag,
"print_aa=i" => \$print_aa,
"print_nt=i" => \$print_nt,
"print_gg=i" => \$print_gg,
"print_loc=i" => \$print_loc,
"verbose=i" => \$verbose,
"debug=i" => \$debug
);
$id_tag = $id_tag ? $id_tag : 'protein_id';
system "mkdir -p $out_dir" unless -e $out_dir;
my $file_id;
if ( $in_file =~ m/.*\/(\S+?)\./ ) {
$file_id = $1;
}
else {
die "error parsing file_id from $in_file\n";
}
my $out_file = $out_dir . $file_id . '.out';
open OUT, ">$out_file" or die "Can't open output file $out_file\n";
my $err_file = $out_dir . $file_id . '.err';
open ERR, ">$err_file" or die "Can't open output file $err_file\n";
if ($print_aa) {
my $aa_out = $out_dir . $file_id . '.aa.fasta';
open AA, ">$aa_out" or die "Can't open output file $aa_out\n";
}
if ($print_nt) {
my $nt_out = $out_dir . $file_id . '.nt.fasta';
open NT, ">$nt_out" or die "Can't open output file $nt_out\n";
}
if ($print_gg) {
my $gg_out = $out_dir . $file_id . '.gg';
open GG, ">$gg_out" or die "Can't open output file $gg_out\n";
}
if ($print_loc) {
my $loc_out = $out_dir . $file_id . '.igr.location';
open LOC, ">$loc_out" or die "Can't open output file $loc_out\n";
}
my $seqio_object = Bio::SeqIO->new(
-format => "genbank",
-file => "$in_file"
);
my $seq_name;
my $count_seq = 0;
my $count_cds_total = 0;
my $count_non_coding_total = 0;
print GG "$file_id\: ";
while ( my $seq_object = $seqio_object->next_seq ) {
my $accession = $seq_object->accession_number;
$count_seq++;
my %nt_seq_hash; # key = seq_name, value = nt_seq
my %aa_seq_hash; # key = seq_name, value = aa_seq
my %start_hash; # key = seq_name, value = start
my %end_hash; # key = seq_name, value = end
my %strand_hash; # key = seq_name, value = strand
my $count_igr_in = 0;
my $count_igr_out = 0;
for my $feat_object ( $seq_object->get_SeqFeatures ) {
if ( $feat_object->primary_tag eq 'CDS' ) {
# ignore pseudo genes
next if ( $feat_object->has_tag("pseudo") );
$count_igr_in++;
if ( $feat_object->has_tag($id_tag) ) {
for my $value ( $feat_object->get_tag_values($id_tag) ) {
$seq_name = $value;
}
if ( $id_tag eq 'protein_id' && $seq_name =~ m/(\S+)\./ ) {
$seq_name = $1;
}
$count_igr_out++;
$start_hash{$seq_name} = $feat_object->location->end;
$end_hash{$seq_name} = $feat_object->location->start;
$strand_hash{$seq_name} = $feat_object->location->strand;
$nt_seq_hash{$seq_name} = $feat_object->spliced_seq->seq;
if ( $feat_object->has_tag("translation") ) {
for my $value (
$feat_object->get_tag_values("translation")
) {
$aa_seq_hash{$seq_name} = $value;
}
}
else {
print ERR "$file_id: $accession: $seq_name: no
translation!\
n";
}
}
else {
print ERR "$file_id: $accession: no $id_tag!\n";
}
}
}
my $count_order = 0;
foreach $seq_name (
sort { $start_hash{$a} $start_hash{$b} }
keys %start_hash
)
{
$count_order++;
print AA ">$seq_name\n$aa_seq_hash{$seq_name}\n" if ($print_aa);
print NT ">$seq_name\n$nt_seq_hash{$seq_name}\n" if ($print_nt);
print GG " $seq_name" if ($print_gg);
print LOC "$seq_name\t$accession\t$count_order\t",
"$start_hash{$seq_name}\t$end_hash{$seq_name}\t",
"$strand_hash{$seq_name}\n" if ($print_loc);
}
print OUT "$file_id: $accession: count_igr_out = $count_igr_out\n";
if ( $count_igr_in != $count_igr_out ) {
print ERR "$file_id: $accession: count_igr_in = $count_igr_in ",
"!= count_igr_out = $count_igr_out!\n";
}
$count_igr_total += $count_igr_out;
} # while $seq_object
print OUT "$file_id: count_seq = $count_seq, count_igr_total =
$count_igr_total\
n";
print GG "\n" if ($print_gg);
close OUT;
close ERR;
close AA if ($print_aa);
close NT if ($print_nt);
close GG if ($print_gg);
close LOC if ($print_loc);
exit(0);
--
※ 發信站: 批踢踢實業坊(ptt.cc)
◆ From: 125.226.136.153
→
07/31 10:26, , 1F
07/31 10:26, 1F
→
07/31 19:07, , 2F
07/31 19:07, 2F
→
07/31 19:09, , 3F
07/31 19:09, 3F
→
07/31 19:44, , 4F
07/31 19:44, 4F
→
07/31 21:14, , 5F
07/31 21:14, 5F
→
07/31 21:18, , 6F
07/31 21:18, 6F
→
07/31 21:52, , 7F
07/31 21:52, 7F
→
07/31 21:53, , 8F
07/31 21:53, 8F
BioMedInfo 近期熱門文章
PTT職涯區 即時熱門文章