conda activate ncbi_datasets
# Fetch the full NCBI genome metadata for every accession listed in
# NovCor_TableS1_accessions.txt (one accession per line) and collect the
# results in a single tab-separated table. The header row is written once,
# by the first accession processed; later accessions append data rows only.
accessions_file="data/genome_lists/NovCor_TableS1_accessions.txt"
output_file="data/genome_lists/NovCor_TableS1_NCBI.csv"
header_written=false
# `|| [ -n "$accessionID" ]` makes the loop also process a final line that is
# not terminated by a newline (plain `read` returns non-zero for it).
while read -r accessionID || [ -n "$accessionID" ]; do
  # Drop a trailing carriage return so files saved with Windows (CRLF) line
  # endings work as well as Unix (LF) ones.
  accessionID="${accessionID%$'\r'}"
  # Skip blank lines instead of querying NCBI with an empty accession.
  if [ -z "$accessionID" ]; then
    continue
  fi
  if [ "$header_written" = false ]; then
    # First accession: create/overwrite the output file, keeping the header.
    datasets summary genome accession "$accessionID" --as-json-lines | dataformat tsv genome > "$output_file"
    header_written=true
  else
    # Subsequent accessions: append rows without repeating the header.
    datasets summary genome accession "$accessionID" --as-json-lines | dataformat tsv genome --elide-header >> "$output_file"
  fi
done < "$accessions_file"
Genomes Summary
See the NCBI `dataformat tsv genome` reference for a description of the output columns: https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_genome/
# Read the NCBI metadata table produced by `dataformat tsv genome`. The file
# is tab-separated despite its .csv extension, so the delimiter is explicit.
NCBI_metadata <- read_delim(
  "data/genome_lists/NovCor_TableS1_NCBI.csv",
  delim = "\t"
)

# Keep the accession plus the assembly-statistics and CheckM columns, drop
# duplicated rows, and discard columns that contain only missing values.
NCBI_metadata <- NCBI_metadata %>%
  select(
    all_of("Assembly Accession"),
    contains("Assembly Stats"),
    contains("CheckM")
  ) %>%
  distinct() %>%
  select(where(~ any(!is.na(.x))))

# Add the NCBI metadata to the strain/genome table, matching rows on
# `Assembly Accession`; strains without a match keep NA in the new columns.
TableS1 <- read_excel("data/genome_lists/NovCor_TableS1_StrainGenomes.xlsx")
TableS1 <- left_join(TableS1, NCBI_metadata, join_by(`Assembly Accession`))

write_csv(
  TableS1,
  "data/genome_lists/NovCor_TableS1_StrainGenomes_Updated.csv"
)
This table was renamed to Table S3 in the final version of the manuscript.