[程式] 如何用perl找出intergenic region

看板BioMedInfo (生醫資訊)作者 (請天空給我)時間13年前 (2011/07/31 01:26), 編輯推噓0(008)
留言8則, 3人參與, 最新討論串1/1
小的是perl新手 以下是參考用perl找到CDS的程式碼 將start_hash的location改為end end_hash的location改為start 想請問如何修改才能得到所需的intergenic region 謝謝~ #!/usr/bin/perl use strict; use warnings; use Getopt::Long; use File::Basename; use Bio::SeqIO; my $in_file; my $out_dir; my $id_tag; my $print_aa; my $print_nt; my $print_gg; my $print_loc; my $verbose; my $debug; GetOptions( "in_file=s" => \$in_file, "out_dir=s" => \$out_dir, "id_tag=s" => \$id_tag, "print_aa=i" => \$print_aa, "print_nt=i" => \$print_nt, "print_gg=i" => \$print_gg, "print_loc=i" => \$print_loc, "verbose=i" => \$verbose, "debug=i" => \$debug ); $id_tag = $id_tag ? $id_tag : 'protein_id'; system "mkdir -p $out_dir" unless -e $out_dir; my $file_id; if ( $in_file =~ m/.*\/(\S+?)\./ ) { $file_id = $1; } else { die "error parsing file_id from $in_file\n"; } my $out_file = $out_dir . $file_id . '.out'; open OUT, ">$out_file" or die "Can't open output file $out_file\n"; my $err_file = $out_dir . $file_id . '.err'; open ERR, ">$err_file" or die "Can't open output file $err_file\n"; if ($print_aa) { my $aa_out = $out_dir . $file_id . '.aa.fasta'; open AA, ">$aa_out" or die "Can't open output file $aa_out\n"; } if ($print_nt) { my $nt_out = $out_dir . $file_id . '.nt.fasta'; open NT, ">$nt_out" or die "Can't open output file $nt_out\n"; } if ($print_gg) { my $gg_out = $out_dir . $file_id . '.gg'; open GG, ">$gg_out" or die "Can't open output file $gg_out\n"; } if ($print_loc) { my $loc_out = $out_dir . $file_id . '.igr.location'; open LOC, ">$loc_out" or die "Can't open output file $loc_out\n"; } my $seqio_object = Bio::SeqIO->new( -format => "genbank", -file => "$in_file" ); my $seq_name; my $count_seq = 0; my $count_cds_total = 0; my $count_non_coding_total = 0; print GG "$file_id\: "; while ( my $seq_object = $seqio_object->next_seq ) { my $accession = $seq_object->accession_number; $count_seq++; my %nt_seq_hash; # key = seq_name, value = nt_seq my %aa_seq_hash; # key = seq_name, value = aa_seq my %start_hash; # key = seq_name, value = start my %end_hash; # key = seq_name, value = end my %strand_hash; # key = seq_name, value = strand my $count_igr_in = 0; my $count_igr_out = 0; for my $feat_object ( $seq_object->get_SeqFeatures ) { if ( $feat_object->primary_tag eq 'CDS' ) { # ignore pseudo genes next if ( $feat_object->has_tag("pseudo") ); $count_igr_in++; if ( $feat_object->has_tag($id_tag) ) { for my $value ( $feat_object->get_tag_values($id_tag) ) { $seq_name = $value; } if ( $id_tag eq 'protein_id' && $seq_name =~ m/(\S+)\./ ) { $seq_name = $1; } $count_igr_out++; $start_hash{$seq_name} = $feat_object->location->end; $end_hash{$seq_name} = $feat_object->location->start; $strand_hash{$seq_name} = $feat_object->location->strand; $nt_seq_hash{$seq_name} = $feat_object->spliced_seq->seq; if ( $feat_object->has_tag("translation") ) { for my $value ( $feat_object->get_tag_values("translation") ) { $aa_seq_hash{$seq_name} = $value; } } else { print ERR "$file_id: $accession: $seq_name: no translation!\ n"; } } else { print ERR "$file_id: $accession: no $id_tag!\n"; } } } my $count_order = 0; foreach $seq_name ( sort { $start_hash{$a} $start_hash{$b} } keys %start_hash ) { $count_order++; print AA ">$seq_name\n$aa_seq_hash{$seq_name}\n" if ($print_aa); print NT ">$seq_name\n$nt_seq_hash{$seq_name}\n" if ($print_nt); print GG " $seq_name" if ($print_gg); print LOC "$seq_name\t$accession\t$count_order\t", "$start_hash{$seq_name}\t$end_hash{$seq_name}\t", "$strand_hash{$seq_name}\n" if ($print_loc); } print OUT "$file_id: $accession: count_igr_out = $count_igr_out\n"; if ( $count_igr_in != $count_igr_out ) { print ERR "$file_id: $accession: count_igr_in = $count_igr_in ", "!= count_igr_out = $count_igr_out!\n"; } $count_igr_total += $count_igr_out; } # while $seq_object print OUT "$file_id: count_seq = $count_seq, count_igr_total = $count_igr_total\ n"; print GG "\n" if ($print_gg); close OUT; close ERR; close AA if ($print_aa); close NT if ($print_nt); close GG if ($print_gg); close LOC if ($print_loc); exit(0); -- ※ 發信站: 批踢踢實業坊(ptt.cc) ◆ From: 125.226.136.153

07/31 10:26, , 1F
意思是$nt_seq_hash{$seq_name}不是你要的IGR嗎?
07/31 10:26, 1F

07/31 19:07, , 2F
恩恩 想請問$feat_object->primary_tag eq 'CDS'
07/31 19:07, 2F

07/31 19:09, , 3F
是指 讀入CDS片段嗎?
07/31 19:09, 3F

07/31 19:44, , 4F
這指的是找出當中被標上CDS標籤的部份
07/31 19:44, 4F

07/31 21:14, , 5F
不知道你是不是想藉由找出cds的位置來取得non-cds的序列,
07/31 21:14, 5F

07/31 21:18, , 6F
是的話你可能要用其他method取得序列,並確定基因上下游關係
07/31 21:18, 6F

07/31 21:52, , 7F
請問 若是想從Genbank所取得的序列 直接找出intergenic
07/31 21:52, 7F

07/31 21:53, , 8F
region 該如何下手呢? 謝謝~
07/31 21:53, 8F
文章代碼(AID): #1ED3xR4R (BioMedInfo)
文章代碼(AID): #1ED3xR4R (BioMedInfo)