Update Kraken databases
Kraken is a really good k-mer based classification tool. I frequently use this tool for viral signal detection in metagenomic samples. A number of useful scripts, such as those for updating Kraken databases, are provided with the tool. Since the NCBI updated the FTP website structure and decided to phase out GenBank Identifiers (GIs), the default Kraken database update scripts no longer work. So I decided to write a little Python script to update Kraken databases that some Kraken users might find useful.
This script automatically downloads:
- Human genome – most recent assembly version
- Complete bacterial reference genomes
- Complete viral reference genomes
- Archaeal genomes
- Reference plasmids sequences
This script takes an optional command-line argument which can be specified as the target location where the data should be downloaded and saved. By default, all files are downloaded in the present working directory.
To change default genome downloads e.g. download mouse reference genome instead of human; please make necessary changes in the code. Feel free to fork this script on github.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Download reference sequences from NCBI RefSeq and reformat them into
Kraken-compatible FASTA files (headers of the form
``>accession|kraken:taxid|NNN``), then build a Kraken database.

Downloads: human genome (latest assembly), complete bacterial genomes,
viral genomes, archaeal genomes and reference plasmids.

Usage: UpdateKrakenDatabases.py [target_directory]
(defaults to the current working directory)

@author: sejalmodha
"""
import os
import re
import subprocess
import sys

import pandas as pd
from Bio import SeqIO

# Target directory: optional first command-line argument, otherwise the
# present working directory.
if len(sys.argv) > 1:
    cwd = sys.argv[1]
else:
    cwd = os.getcwd()
print(cwd)
os.chdir(cwd)
print(os.getcwd())


def process_url_file(inputurlfile):
    """Download genomes listed in a URL file produced from assembly summaries.

    Each line of *inputurlfile* is ``ftp_path,accession,asm_name``; the
    corresponding ``<accession>_<asm_name>_genomic.gbff.gz`` file is
    fetched with wget and unzipped in place.
    """
    file_suffix = 'genomic.gbff.gz'
    # 'with' guarantees the URL file is closed even if a download fails.
    with open(inputurlfile, 'r') as url_file:
        for line in url_file:
            url = line.rstrip('\n').split(',')
            ftp_url = url[0] + '/' + url[1] + '_' + url[2] + '_' + file_suffix
            print("Downloading " + ftp_url)
            # Download the file in GenBank flat-file (gbff) format.
            subprocess.call("wget " + ftp_url, shell=True)
    # Unzip everything once all downloads have finished.
    subprocess.call("gunzip *.gz", shell=True)
    return


def download_bacterial_genomes(outfile='outfile.txt'):
    """Download all latest, complete bacterial RefSeq genomes.

    Writes the filtered (ftp_path, accession, asm_name) table to
    *outfile* and hands it to :func:`process_url_file`.
    """
    assembly_summary_file = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt'
    if os.path.exists('assembly_summary.txt'):
        os.remove('assembly_summary.txt')
    subprocess.call("wget " + assembly_summary_file, shell=True)
    # Strip the first comment line and the leading '# ' so the header
    # row becomes pandas-friendly.
    subprocess.call("sed -i '1d' assembly_summary.txt", shell=True)
    subprocess.call("sed -i 's/^# //' assembly_summary.txt", shell=True)
    # Tab-separated file; low_memory=False avoids the mixed-dtype
    # DtypeWarning that the wide assembly summary otherwise triggers.
    assembly_sum = pd.read_csv('assembly_summary.txt', sep='\t', low_memory=False)
    # Keep only the latest, complete genome assemblies.
    my_df = assembly_sum[(assembly_sum['version_status'] == 'latest')
                         & (assembly_sum['assembly_level'] == 'Complete Genome')]
    my_df = my_df[['ftp_path', 'assembly_accession', 'asm_name']]
    my_df.to_csv(outfile, mode='w', index=False, header=False)
    process_url_file(outfile)
    return


def download_refseq_genome(taxid=9606, outfile='refseq_genome.txt'):
    """Download the RefSeq reference/representative genome for *taxid*.

    Defaults to the latest human reference genome (taxid 9606).
    """
    assembly_summary_file = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt'
    if os.path.exists('assembly_summary_refseq.txt'):
        os.remove('assembly_summary_refseq.txt')
    subprocess.call("wget " + assembly_summary_file, shell=True)
    # Reformat the file to a pandas-friendly format (see above).
    subprocess.call("sed -i '1d' assembly_summary_refseq.txt", shell=True)
    subprocess.call("sed -i 's/^# //' assembly_summary_refseq.txt", shell=True)
    assembly_sum = pd.read_csv('assembly_summary_refseq.txt', sep='\t', low_memory=False)
    my_df = assembly_sum[(assembly_sum['taxid'] == taxid)
                         & ((assembly_sum['refseq_category'] == 'reference genome')
                            | (assembly_sum['refseq_category'] == 'representative genome'))]
    my_df = my_df[['ftp_path', 'assembly_accession', 'asm_name']]
    # Process the newly created file and download genomes from NCBI.
    my_df.to_csv(outfile, mode='w', index=False, header=False)
    process_url_file(outfile)
    return


def get_fasta_in_kraken_format(outfile_fasta='sequences.fa'):
    """Convert every ``*.gbff`` file in the working directory to
    Kraken-formatted FASTA and delete the GenBank file afterwards.

    Headers are ``>seq_id|kraken:taxid|NNN`` where NNN is taken from the
    source feature's ``taxon:`` db_xref.
    """
    with open(outfile_fasta, 'w') as output:
        for file_name in os.listdir(cwd):
            if not file_name.endswith('.gbff'):
                continue
            for seq_record in SeqIO.parse(file_name, "genbank"):
                seq_id = seq_record.id
                seq = seq_record.seq
                for feature in seq_record.features:
                    if 'source' in feature.type:
                        # A source feature may carry several db_xref
                        # entries (or none) — pick the 'taxon:' one
                        # instead of joining them all together.
                        xrefs = feature.qualifiers.get('db_xref', [])
                        taxon = [x for x in xrefs if x.startswith('taxon:')]
                        if not taxon:
                            # No taxonomy ID: skip rather than emit a
                            # header Kraken cannot parse.
                            continue
                        taxid = re.sub(r'^taxon:', 'kraken:taxid|', taxon[0])
                        print(taxid)
                        output.write(">" + seq_id + "|" + taxid + "\n"
                                     + str(seq) + "\n")
            os.remove(file_name)
    return


print('Downloading human genome' + '\n')
# To download a different reference genome, replace taxonomy ID 9606
# (human) with the taxonomy ID of the genome of interest,
# e.g. download_refseq_genome(10116, 'rat_genome_url.txt').
download_refseq_genome(9606, 'human_genome_url.txt')
print('Converting sequences to kraken input format' + '\n')
get_fasta_in_kraken_format('human_genome.fa')

print('Downloading bacterial genomes' + '\n')
download_bacterial_genomes('bacterial_complete_genome_url.txt')
print('Converting sequences to kraken input format' + '\n')
get_fasta_in_kraken_format('bacterial_genomes.fa')

print('Downloading viral genomes' + '\n')
subprocess.call('wget ftp://ftp.ncbi.nlm.nih.gov/refseq/release/viral/viral.*.genomic.gbff.gz', shell=True)
subprocess.call('gunzip *.gz', shell=True)
print('Converting sequences to kraken input format' + '\n')
get_fasta_in_kraken_format('viral_genomes.fa')

print('Downloading archaeal genomes' + '\n')
subprocess.call('wget ftp://ftp.ncbi.nlm.nih.gov/refseq/release/archaea/archaea.*.genomic.gbff.gz', shell=True)
subprocess.call('gunzip *.gz', shell=True)
print('Converting sequences to kraken input format' + '\n')
get_fasta_in_kraken_format('archaeal_genomes.fa')

print('Downloading plasmid sequences' + '\n')
subprocess.call('wget ftp://ftp.ncbi.nlm.nih.gov/refseq/release/plasmid/plasmid.*.genomic.gbff.gz', shell=True)
subprocess.call('gunzip *.gz', shell=True)
print('Converting sequences to kraken input format' + '\n')
get_fasta_in_kraken_format('plasmids_genomes.fa')

# Name of the Kraken database directory to build.
krakendb = 'HumanVirusBacteria'
subprocess.call('kraken-build --download-taxonomy --db ' + krakendb, shell=True)
print('Running Kraken DB build for ' + krakendb + '\n')
print('This might take a while ' + '\n')
# Add every generated FASTA file to the Kraken library, then build.
for fasta_file in os.listdir(cwd):
    if fasta_file.endswith('.fa') or fasta_file.endswith('.fasta'):
        print(fasta_file)
        subprocess.call('kraken-build --add-to-library ' + fasta_file
                        + ' --db ' + krakendb, shell=True)
subprocess.call('kraken-build --build --db ' + krakendb + ' --threads 12', shell=True)
Hello Sejal Modha, I have been using your script, but at some point the following error appears:
sys:1: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.
Traceback (most recent call last):
File “./UpdateKrakenDatabases.py”, line 118, in
get_fasta_in_kraken_format(‘human_genome.fa’)
File “./UpdateKrakenDatabases.py”, line 98, in get_fasta_in_kraken_format
for seq_record in records:
File “/aplic/GOOLF/1.6.10/Python/3.3.2/lib/python3.3/site-packages/Bio/SeqIO/__init__.py”, line 600, in parse
for r in i:
File “/aplic/GOOLF/1.6.10/Python/3.3.2/lib/python3.3/site-packages/Bio/GenBank/Scanner.py”, line 478, in parse_records
record = self.parse(handle, do_features)
File “/aplic/GOOLF/1.6.10/Python/3.3.2/lib/python3.3/site-packages/Bio/GenBank/Scanner.py”, line 462, in parse
if self.feed(handle, consumer, do_features):
File “/aplic/GOOLF/1.6.10/Python/3.3.2/lib/python3.3/site-packages/Bio/GenBank/Scanner.py”, line 430, in feed
self._feed_header_lines(consumer, self.parse_header())
File “/aplic/GOOLF/1.6.10/Python/3.3.2/lib/python3.3/site-packages/Bio/GenBank/Scanner.py”, line 1436, in _feed_header_lines
structured_comment_key = re.search(r”([^#]+){0}$”.format(STRUCTURED_COMMENT_START), data).group(1)
AttributeError: ‘NoneType’ object has no attribute ‘group’
Do you know what it can be?
Thank you very much,
Guillermo