Hi @Pioneer82,
Take a look at this:
# Create example data using base R
# ------------------------------------------------------------------------------
# Set some params
set.seed(610868)
n_files = 100
n_seqs = 1000
l_seqs = 5
dna = c('A', 'C', 'T', 'G')
out_dir = '~/tmp'
# Create files
for( i in 1:n_files ){
seqs_for_file = rep(NA, n_seqs)
for( j in 1:n_seqs ){
seqs_for_file[j] = paste0(sample(x = dna, size = l_seqs, replace = TRUE),
collapse = '')
}
write.table(x = seqs_for_file, file = paste0(out_dir, '/seqs_', i, '.txt'),
quote = FALSE, row.names = FALSE, col.names = FALSE)
}
# Analyse files using Tidyverse
# ------------------------------------------------------------------------------
# Load libraries
library('tidyverse')
# Get input file names
target_files = list.files(path = out_dir, pattern = "\\.txt$")
# Read files into list of tibbles
d = map(str_c(out_dir, '/', target_files), read_table, col_names = FALSE)
# Set names in list to target files
names(d) = target_files
# To each tibble, add a variable with the target file name
d = lapply(names(d), function(n_i){ d[[n_i]] = mutate(d[[n_i]], f = n_i) })
# Combine to one tibble and set variable names
d = d %>% bind_rows %>% rename(input_file = f, seq = X1)
# Create count table
seq_count_table = d %>%
count(input_file, seq) %>%
spread(seq, n) %>%
mutate_if(is.numeric, replace_na, 0)
Yielding:
> seq_count_table
# A tibble: 100 x 1,025
input_file AAAAA AAAAC AAAAG AAAAT AAACA AAACC AAACG AAACT AAAGA AAAGC AAAGG AAAGT AAATA AAATC AAATG AAATT AACAA
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 seqs_1.txt 6 1 1 0 1 1 2 0 2 1 3 1 1 2 0 2 2
2 seqs_10.t… 0 0 0 0 1 1 3 2 0 2 0 1 0 2 0 0 1
3 seqs_100.… 0 0 0 0 0 0 1 0 3 1 0 1 2 0 2 1 1
4 seqs_11.t… 2 1 0 2 0 1 0 1 1 1 0 2 1 1 1 0 1
5 seqs_12.t… 2 1 2 0 2 1 1 0 0 1 2 0 0 1 2 0 2
6 seqs_13.t… 1 0 1 1 0 1 0 2 1 1 0 0 1 0 2 0 1
7 seqs_14.t… 0 1 2 1 0 1 0 1 0 1 1 1 0 4 1 0 1
8 seqs_15.t… 0 2 1 0 0 3 0 0 3 3 0 0 0 0 0 1 1
9 seqs_16.t… 1 1 0 0 2 0 0 1 0 1 0 0 1 0 1 3 0
10 seqs_17.t… 0 2 0 1 1 1 2 0 0 1 1 3 1 2 2 2 1
# … with 90 more rows, and 1,007 more variables: AACAC <dbl>, AACAG <dbl>, AACAT <dbl>, AACCA <dbl>, AACCC <dbl>,
# AACCG <dbl>, AACCT <dbl>, AACGA <dbl>, AACGC <dbl>, AACGG <dbl>, AACGT <dbl>, AACTA <dbl>, AACTC <dbl>,
# AACTG <dbl>, AACTT <dbl>, AAGAA <dbl>, AAGAC <dbl>, AAGAG <dbl>, AAGAT <dbl>, AAGCA <dbl>, AAGCC <dbl>,
# AAGCG <dbl>, AAGCT <dbl>, AAGGA <dbl>, AAGGC <dbl>, AAGGG <dbl>, AAGGT <dbl>, AAGTA <dbl>, AAGTC <dbl>,
# AAGTG <dbl>, AAGTT <dbl>, AATAA <dbl>, AATAC <dbl>, AATAG <dbl>, AATAT <dbl>, AATCA <dbl>, AATCC <dbl>,
# AATCG <dbl>, AATCT <dbl>, AATGA <dbl>, AATGC <dbl>, AATGG <dbl>, AATGT <dbl>, AATTA <dbl>, AATTC <dbl>,
# AATTG <dbl>, AATTT <dbl>, ACAAA <dbl>, ACAAC <dbl>, ACAAG <dbl>, ACAAT <dbl>, ACACA <dbl>, ACACC <dbl>,
# ACACG <dbl>, ACACT <dbl>, ACAGA <dbl>, ACAGC <dbl>, ACAGG <dbl>, ACAGT <dbl>, ACATA <dbl>, ACATC <dbl>,
# ACATG <dbl>, ACATT <dbl>, ACCAA <dbl>, ACCAC <dbl>, ACCAG <dbl>, ACCAT <dbl>, ACCCA <dbl>, ACCCC <dbl>,
# ACCCG <dbl>, ACCCT <dbl>, ACCGA <dbl>, ACCGC <dbl>, ACCGG <dbl>, ACCGT <dbl>, ACCTA <dbl>, ACCTC <dbl>,
# ACCTG <dbl>, ACCTT <dbl>, ACGAA <dbl>, ACGAC <dbl>, ACGAG <dbl>, ACGAT <dbl>, ACGCA <dbl>, ACGCC <dbl>,
# ACGCG <dbl>, ACGCT <dbl>, ACGGA <dbl>, ACGGC <dbl>, ACGGG <dbl>, ACGGT <dbl>, ACGTA <dbl>, ACGTC <dbl>,
# ACGTG <dbl>, ACGTT <dbl>, ACTAA <dbl>, ACTAC <dbl>, ACTAG <dbl>, ACTAT <dbl>, ACTCA <dbl>, …
Each row is a file and each column is a sequence, so now you can check sequence occurrence across files like so:
> seq_count_table %>% select(input_file, ATGCT) %>% filter(ATGCT > 0) %>% arrange(-ATGCT)
# A tibble: 67 x 2
input_file ATGCT
<chr> <dbl>
1 seqs_37.txt 5
2 seqs_23.txt 4
3 seqs_1.txt 3
4 seqs_29.txt 3
5 seqs_38.txt 3
6 seqs_46.txt 3
7 seqs_59.txt 3
8 seqs_71.txt 3
9 seqs_85.txt 3
10 seqs_91.txt 3
# … with 57 more rows
Hope it helps 