ai
  • Crypto News
  • Ai
  • eSports
  • Bitcoin
  • Ethereum
  • Blockchain
Home»Ai»How to Create a Bioinformatics AI Agent Using Biopython for DNA and Protein Analysis
Ai

How to Create a Bioinformatics AI Agent Using Biopython for DNA and Protein Analysis

Share
Facebook Twitter LinkedIn Pinterest Email
class BioPythonAIAgent:
   def __init__(self, email="[email protected]"):
       self.email = email
       Entrez.email = email
       self.sequences = {}
       self.analysis_results = {}
       self.alignments = {}
       self.trees = {}
  
   def fetch_sequence_from_ncbi(self, accession_id, db="nucleotide", rettype="fasta"):
       try:
           handle = Entrez.efetch(db=db, id=accession_id, rettype=rettype, retmode="text")
           record = SeqIO.read(handle, "fasta")
           handle.close()
           self.sequences[accession_id] = record
           return record
       except Exception as e:
           print(f"Error fetching sequence: {str(e)}")
           return None
  
   def create_sample_sequences(self):
       covid_spike = "MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT"
      
       human_insulin = "MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN"
      
       e_coli_16s = "AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAGCAGCTTGCTGCTTTGCTGACGAGTGGCGGACGGGTGAGTAATGTCTGGGAAACTGCCTGATGGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAATGTCGCAAGACCAAAGAGGGGGACCTTCGGGCCTCTTGCCATCGGATGTGCCCAGATGGGATTAGCTAGTAGGTGGGGTAACGGCTCACCTAGGCGACGATCCCTAGCTGGTCTGAGAGGATGACCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAAGGCGTTAAGGTTAATAACCTTGGCGATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTCTGTCAAGTCGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAAAGACTGACGCTCAGGTGCGAAAGCGTGGGGAGCAAACA"
      
       sample_sequences = [
           ("COVID_Spike", covid_spike, "SARS-CoV-2 Spike Protein"),
           ("Human_Insulin", human_insulin, "Human Insulin Precursor"),
           ("E_coli_16S", e_coli_16s, "E. coli 16S rRNA")
       ]
      
       for seq_id, seq_str, desc in sample_sequences:
           record = SeqRecord(Seq(seq_str), id=seq_id, description=desc)
           self.sequences[seq_id] = record
      
       return sample_sequences
  
   def analyze_sequence(self, sequence_id=None, sequence=None):
       if sequence_id and sequence_id in self.sequences:
           seq_record = self.sequences[sequence_id]
           seq = seq_record.seq
           description = seq_record.description
       elif sequence:
           seq = Seq(sequence)
           description = "Custom sequence"
       else:
           return None
      
       analysis = {
           'length': len(seq),
           'composition': {}
       }
      
       for base in ['A', 'T', 'G', 'C']:
           analysis['composition'][base] = seq.count(base)
      
       if 'A' in analysis['composition'] and 'T' in analysis['composition']:
           analysis['gc_content'] = round(gc_fraction(seq) * 100, 2)
           try:
               analysis['molecular_weight'] = round(molecular_weight(seq, seq_type="DNA"), 2)
           except:
               analysis['molecular_weight'] = len(seq) * 650
      
       try:
           if len(seq) % 3 == 0:
               protein = seq.translate()
               analysis['translation'] = str(protein)
               analysis['stop_codons'] = protein.count('*')
              
               if '*' not in str(protein)[:-1]:
                   prot_analysis = ProteinAnalysis(str(protein)[:-1])
                   analysis['protein_mw'] = round(prot_analysis.molecular_weight(), 2)
                   analysis['isoelectric_point'] = round(prot_analysis.isoelectric_point(), 2)
                   analysis['protein_composition'] = prot_analysis.get_amino_acids_percent()
       except:
           pass
      
       key = sequence_id if sequence_id else "custom"
       self.analysis_results[key] = analysis
      
       return analysis
  
   def visualize_composition(self, sequence_id):
       if sequence_id not in self.analysis_results:
           return
      
       analysis = self.analysis_results[sequence_id]
      
       fig = make_subplots(
           rows=2, cols=2,
           specs=[[{"type": "pie"}, {"type": "bar"}],
                  [{"colspan": 2}, None]],
           subplot_titles=("Nucleotide Composition", "Base Count", "Sequence Properties")
       )
      
       labels = list(analysis['composition'].keys())
       values = list(analysis['composition'].values())
      
       fig.add_trace(
           go.Pie(labels=labels, values=values, name="Composition"),
           row=1, col=1
       )
      
       fig.add_trace(
           go.Bar(x=labels, y=values, name="Count", marker_color=['red', 'blue', 'green', 'orange']),
           row=1, col=2
       )
      
       properties = ['Length', 'GC%', 'MW (kDa)']
       prop_values = [
           analysis['length'],
           analysis.get('gc_content', 0),
           analysis.get('molecular_weight', 0) / 1000
       ]
      
       fig.add_trace(
           go.Scatter(x=properties, y=prop_values, mode="markers+lines",
                     marker=dict(size=10, color="purple"), name="Properties"),
           row=2, col=1
       )
      
       fig.update_layout(
           title=f"Comprehensive Analysis: {sequence_id}",
           showlegend=False,
           height=600
       )
      
       fig.show()
  
   def perform_multiple_sequence_alignment(self, sequence_ids):
       if len(sequence_ids)  1:
           fig = make_subplots(
               rows=2, cols=2,
               subplot_titles=("Length Comparison", "GC Content", "Molecular Weight", "Composition Heatmap")
           )
          
           fig.add_trace(
               go.Bar(x=df['sequence_id'], y=df['length'], name="Length"),
               row=1, col=1
           )
          
           if 'gc_content' in df.columns:
               fig.add_trace(
                   go.Scatter(x=df['sequence_id'], y=df['gc_content'], mode="markers+lines", name="GC%"),
                   row=1, col=2
               )
          
           if 'molecular_weight' in df.columns:
               fig.add_trace(
                   go.Bar(x=df['sequence_id'], y=df['molecular_weight'], name="MW"),
                   row=2, col=1
               )
          
           fig.update_layout(title="Comparative Sequence Analysis", height=600)
           fig.show()
      
       return df
  
   def codon_usage_analysis(self, sequence_id):
       if sequence_id not in self.sequences:
           return None
      
       seq = self.sequences[sequence_id].seq
      
       if len(seq) % 3 != 0:
           return None
      
       codons = {}
       for i in range(0, len(seq) - 2, 3):
           codon = str(seq[i:i+3])
           codons[codon] = codons.get(codon, 0) + 1
      
       codon_df = pd.DataFrame(list(codons.items()), columns=['Codon', 'Count'])
       codon_df = codon_df.sort_values('Count', ascending=False)
      
       fig = px.bar(codon_df.head(20), x='Codon', y='Count',
                    title=f"Top 20 Codon Usage - {sequence_id}")
       fig.show()
      
       return codon_df
  
   def motif_search(self, sequence_id, motif_pattern):
       if sequence_id not in self.sequences:
           return []
      
       seq = str(self.sequences[sequence_id].seq)
       positions = []
      
       for i in range(len(seq) - len(motif_pattern) + 1):
           if seq[i:i+len(motif_pattern)] == motif_pattern:
               positions.append(i)
      
       return positions
  
   def gc_content_window(self, sequence_id, window_size=100):
       if sequence_id not in self.sequences:
           return None
      
       seq = self.sequences[sequence_id].seq
       gc_values = []
       positions = []
      
       for i in range(0, len(seq) - window_size + 1, window_size//4):
           window = seq[i:i+window_size]
           gc_values.append(gc_fraction(window) * 100)
           positions.append(i + window_size//2)
      
       fig = go.Figure()
       fig.add_trace(go.Scatter(x=positions, y=gc_values, mode="lines+markers",
                               name=f'GC Content (window={window_size})'))
       fig.update_layout(
           title=f"GC Content Sliding Window Analysis - {sequence_id}",
           xaxis_title="Position",
           yaxis_title="GC Content (%)"
       )
       fig.show()
      
       return positions, gc_values
  
   def run_comprehensive_analysis(self, sequence_ids):
       results = {}
      
       for seq_id in sequence_ids:
           if seq_id in self.sequences:
               analysis = self.analyze_sequence(seq_id)
               self.visualize_composition(seq_id)
              
               gc_analysis = self.gc_content_window(seq_id)
               codon_analysis = self.codon_usage_analysis(seq_id)
              
               results[seq_id] = {
                   'basic_analysis': analysis,
                   'gc_window': gc_analysis,
                   'codon_usage': codon_analysis
               }
      
       if len(sequence_ids) > 1:
           comparative_df = self.comparative_analysis(sequence_ids)
           results['comparative'] = comparative_df
      
       return results
Share. Facebook Twitter Pinterest LinkedIn Tumblr Email

Related Posts

Meta Superintelligence Labs Introduces REFRAG: Scaling RAG with 16× Longer Contexts and 31× Faster Decoding

septembre 8, 2025

From Pretraining to Post-Training: Why Language Models Hallucinate and How Evaluation Methods Reinforce the Problem

septembre 7, 2025

AI and machine learning for engineering design | MIT News

septembre 7, 2025

Tilde AI Releases TildeOpen LLM: An Open-Source Large Language Model with Over 30 Billion Parameters and Support Most European Languages

septembre 7, 2025
Add A Comment

Comments are closed.

Top Posts

SwissCryptoDaily.ch delivers the latest cryptocurrency news, market insights, and expert analysis. Stay informed with daily updates from the world of blockchain and digital assets.

We're social. Connect with us:

Facebook X (Twitter) Instagram Pinterest YouTube
Top Insights

How Rich Is Ethereum’s Vitalik Buterin? Arkham’s 2025 Report

septembre 8, 2025

Dogecoin Mega Rally Ahead? Crypto Analyst Says $4 Is In Play

septembre 8, 2025

Germany Missed $5B Bitcoin in Piracy Case, Says Arkham

septembre 8, 2025
Get Informed

Subscribe to Updates

Get the latest creative news from FooBar about art, design and business.

Facebook X (Twitter) Instagram Pinterest
  • About us
  • Get In Touch
  • Cookies Policy
  • Privacy-Policy
  • Terms and Conditions
© 2025 Swisscryptodaily.ch.

Type above and press Enter to search. Press Esc to cancel.