#bioinformatics #genomics #micro #sequence-analysis

yanked microBioRust-microSeqIO

Microbiology friendly bioinformatics Rust functions

new 0.1.1 Jan 17, 2025

#38 in #micro

MIT license

79KB
1K SLoC

microBioRust

A Rust bioinformatics crate aimed at Microbial genomics

The aim of this crate is to provide Microbiology friendly Rust functions for bioinformatics.

To use a specific workspace (at the moment microSeqIO or heatmap) clone the project, cd into the specific directory required and build the project from there

In microSeqIO:

You can parse genbank files and save as a GFF (gff3) format as well as extracting DNA sequences, gene DNA sequences (ffn) and protein fasta sequences (faa)

 pub fn genbank_to_faa() -> Result<(), anyhow::Error> {
             let args: Vec<String> = env::args().collect();
             let config = Config::new(&args).unwrap_or_else(|err| {
                println!("Problem with parsing file arguments: {}", err);
                process::exit(1);
                });
            let file_gbk = fs::File::open(config.filename)?;
            let mut reader = Reader::new(file_gbk);
            let mut records = reader.records();
           let mut cds_counter: u32 = 0;
               loop {
                //collect from each record advancing on a next record basis, count cds records
                match records.next() {       
                    Some(Ok(mut record)) => {
                                  for (k, v) in &record.cds.attributes {
                                      match record.seq_features.get_sequence_faa(&k) {
                                          Some(value) => { let seq_faa = value.to_string();
                                                           println!(">{}|{}\n{}", &record.id, &k, seq_faa);
                                                           },
                                          _ => (),
                                         };
                    
                                      }
                                      cds_counter+=1;
                                      },
                    Some(Err(e)) => { println!("Error encountered - an err {:?}", e); },
                   None => {
                          println!("finished iteration");
                         break; },
                    }
                 }
            println!("Total records processed: {}", read_counter);
           return Ok(());
  }

Example to save a provided multi- or single genbank file as a GFF file (by joining any multi-genbank)

    pub fn genbank_to_gff() -> io::Result<()> {
        let args: Vec<String> = env::args().collect();
        let config = Config::new(&args).unwrap_or_else(|err| {
            println!("Problem with parsing file arguments: {}", err);
            process::exit(1);
            });
        let file_gbk = fs::File::open(&config.filename)?;
        let prev_start: u32 = 0;
       let mut prev_end: u32 = 0;
        let mut reader = Reader::new(file_gbk);
        let mut records = reader.records();
        let mut read_counter: u32 = 0;
        let mut seq_region: BTreeMap<String, (u32,u32)> = BTreeMap::new();
        let mut record_vec: Vec<Record> = Vec::new();
        loop {  
            match records.next() {   
                Some(Ok(mut record)) => {
                    //println!("next record");
                    //println!("Record id: {:?}", record.id);
                    let source = record.source_map.source_name.clone().expect("issue collecting source name");
                    let beginning = match record.source_map.get_start(&source) {
                                     Some(value) => value.get_value(),
                                     _ => 0,
                                     };
                   let ending = match record.source_map.get_stop(&source) {
                                     Some(value) => value.get_value(),
                                     _ => 0,
                                     };
                  if ending + prev_end < beginning + prev_end {
                       }
                    seq_region.insert(source, (beginning + prev_end, ending + prev_end));
                    record_vec.push(record);
                    // Add additional fields to print if needed
                    read_counter+=1;
                   prev_end+=ending; // create the joined record if there are multiple
                   },
                Some(Err(e)) => { println!("theres an err {:?}", e); },
                 None => {
                    println!("finished iteration");
                          break; },
                }
            }
        let output_file = format!("{}.gff", &config.filename);
        gff_write(seq_region.clone(), record_vec, &output_file, true);
        println!("Total records processed: {}", read_counter);
        return Ok(());

Example to create a completely new record, use of setters or set_ functionality

To write into GFF format requires gff_write(seq_region, record_vec, filename, true or false)

The seq_region is the region of interest to save with name and DNA coordinates such as seqregion.entry("source_1".to_string(), (1,897)) This makes it possible to save the whole file or to subset it

record_vec is a list of the records. If there is only one record, include this as a vec using vec![record]

The boolean true/false describes whether the DNA sequence should be included in the GFF3 file

To write into genbank format requires gbk_write(seq_region, record_vec, filename), no true or false since genbank format will include the DNA sequence

   pub fn create_new_record() -> Result<(), anyhow::Error> {
       let filename = format!("new_record.gff");
        let mut record = Record::new();
       let mut seq_region: BTreeMap<String, (u32,u32)> = BTreeMap::new();
       //example from E.coli K12
        seq_region.insert("source_1".to_string(), (1,897));
        //Add the source into SourceAttributes
        record.source_map
             .set_counter("source_1".to_string())
             .set_start(RangeValue::Exact(1))
             .set_stop(RangeValue::Exact(897))
             .set_organism("Escherichia coli".to_string())
             .set_mol_type("DNA".to_string())
             .set_strain("K-12 substr. MG1655".to_string())
             .set_type_material("type strain of Escherichia coli K12".to_string())
             .set_db_xref("PRJNA57779".to_string());
        //Add the features into FeatureAttributes, here we are setting two features, i.e. coding sequences or genes
        record.cds
                .set_counter("b3304".to_string())
                .set_start(RangeValue::Exact(1))
                .set_stop(RangeValue::Exact(354))
                .set_gene("rplR".to_string())
                .set_product("50S ribosomal subunit protein L18".to_string())
                 .set_codon_start(1)
                 .set_strand(-1);
        record.cds
                .set_counter("b3305".to_string())
                .set_start(RangeValue::Exact(364))
                 .set_stop(RangeValue::Exact(897))
                 .set_gene("rplF".to_string())
                .set_product("50S ribosomal subunit protein L6".to_string())
                .set_codon_start(1)
                .set_strand(-1);
        //Add the sequences for the coding sequence (CDS) into SequenceAttributes
        record.seq_features
             .set_counter("b3304".to_string())
             .set_start(RangeValue::Exact(1))
                .set_stop(RangeValue::Exact(354))
                .set_sequence_ffn("ATGGATAAGAAATCTGCTCGTATCCGTCGTGCGACCCGCGCACGCCGCAAGCTCCAGGAG
CTGGGCGCAACTCGCCTGGTGGTACATCGTACCCCGCGTCACATTTACGCACAGGTAATT
GCACCGAACGGTTCTGAAGTTCTGGTAGCTGCTTCTACTGTAGAAAAAGCTATCGCTGAA
CAACTGAAGTACACCGGTAACAAAGACGCGGCTGCAGCTGTGGGTAAAGCTGTCGCTGAA
CGCGCTCTGGAAAAAGGCATCAAAGATGTATCCTTTGACCGTTCCGGGTTCCAATATCAT
GGTCGTGTCCAGGCACTGGCAGATGCTGCCCGTGAAGCTGGCCTTCAGTTCTAA".to_string())
                .set_sequence_faa("MDKKSARIRRATRARRKLQELGATRLVVHRTPRHIYAQVIAPNGSEVLVAASTVEKAIAE
QLKYTGNKDAAAAVGKAVAERALEKGIKDVSFDRSGFQYHGRVQALADAAREAGLQF".to_string())
                .set_codon_start(1)
               .set_strand(-1);
        record.seq_features
             .set_counter("bb3305".to_string())
             .set_start(RangeValue::Exact(364))
                .set_stop(RangeValue::Exact(897))
                .set_sequence_ffn("ATGTCTCGTGTTGCTAAAGCACCGGTCGTTGTTCCTGCCGGCGTTGACGTAAAAATCAAC
GGTCAGGTTATTACGATCAAAGGTAAAAACGGCGAGCTGACTCGTACTCTCAACGATGCT
GTTGAAGTTAAACATGCAGATAATACCCTGACCTTCGGTCCGCGTGATGGTTACGCAGAC
GGTTGGGCACAGGCTGGTACCGCGCGTGCCCTGCTGAACTCAATGGTTATCGGTGTTACC
GAAGGCTTCACTAAGAAGCTGCAGCTGGTTGGTGTAGGTTACCGTGCAGCGGTTAAAGGC
AATGTGATTAACCTGTCTCTGGGTTTCTCTCATCCTGTTGACCATCAGCTGCCTGCGGGT
ATCACTGCTGAATGTCCGACTCAGACTGAAATCGTGCTGAAAGGCGCTGATAAGCAGGTG
ATCGGCCAGGTTGCAGCGGATCTGCGCGCCTACCGTCGTCCTGAGCCTTATAAAGGCAAG
GGTGTTCGTTACGCCGACGAAGTCGTGCGTACCAAAGAGGCTAAGAAGAAGTAA".to_string())
                .set_sequence_faa("MSRVAKAPVVVPAGVDVKINGQVITIKGKNGELTRTLNDAVEVKHADNTLTFGPRDGYAD
GWAQAGTARALLNSMVIGVTEGFTKKLQLVGVGYRAAVKGNVINLSLGFSHPVDHQLPAG
ITAECPTQTEIVLKGADKQVIGQVAADLRAYRRPEPYKGKGVRYADEVVRTKEAKKK".to_string())
                .set_codon_start(1)
                .set_strand(-1);
        //Add the full sequence of the entire record into the record.sequence
        record.sequence = "TTAGAACTGAAGGCCAGCTTCACGGGCAGCATCTGCCAGTGCCTGGACACGACCATGATA
TTGGAACCCGGAACGGTCAAAGGATACATCTTTGATGCCTTTTTCCAGAGCGCGTTCAGC
GACAGCTTTACCCACAGCTGCAGCCGCGTCTTTGTTACCGGTGTACTTCAGTTGTTCAGC
GATAGCTTTTTCTACAGTAGAAGCAGCTACCAGAACTTCAGAACCGTTCGGTGCAATTAC
CTGTGCGTAAATGTGACGCGGGGTACGATGTACCACCAGGCGAGTTGCGCCCAGCTCCTG
GAGCTTGCGGCGTGCGCGGGTCGCACGACGGATACGAGCAGATTTCTTATCCATAGTGTT
ACCTTACTTCTTCTTAGCCTCTTTGGTACGCACGACTTCGTCGGCGTAACGAACACCCTT
GCCTTTATAAGGCTCAGGACGACGGTAGGCGCGCAGATCCGCTGCAACCTGGCCGATCAC
CTGCTTATCAGCGCCTTTCAGCACGATTTCAGTCTGAGTCGGACATTCAGCAGTGATACC
CGCAGGCAGCTGATGGTCAACAGGATGAGAGAAACCCAGAGACAGGTTAATCACATTGCC
TTTAACCGCTGCACGGTAACCTACACCAACCAGCTGCAGCTTCTTAGTGAAGCCTTCGGT
AACACCGATAACCATTGAGTTCAGCAGGGCACGCGCGGTACCAGCCTGTGCCCAACCGTC
TGCGTAACCATCACGCGGACCGAAGGTCAGGGTATTATCTGCATGTTTAACTTCAACAGC
ATCGTTGAGAGTACGAGTCAGCTCGCCGTTTTTACCTTTGATCGTAATAACCTGACCGTT
GATTTTTACGTCAACGCCGGCAGGAACAACGACCGGTGCTTTAGCAACACGAGACAT".to_string();
          gff_write(seq_region, vec![record], &filename, true);
       return Ok(());
     }

Dependencies

~20MB
~344K SLoC