Move files in a specific folder with df list

Hi community

I downloaded 20,000 .png files with web scraping and put them into specifically named folders. Each folder has 1 or 2 .png files.

For example, the folder G 1 contains two pictures, one for sed and one for pod (see the link).

# One row per accession: its ID (some IDs contain padding spaces) and the
# folder it should be grouped under
basic <- data.frame(
  ACCESION = c("G    1", "G    7A", "G35015", "G40897", "G27573"),
  FOLDER = c(
    "P_vulgaris", "P_vulgaris", "P_dumosus", "P_albescens", "P_lunatus"
  )
)

Based on the FOLDER column of basic, I want to create one folder per FOLDER name, each containing 2 other folders, one for sed and one for pod.

For example, for the FOLDER P_vulgaris it is necessary to create 2 other folders, sed and pod. In the sed folder I want to put the sed picture of G 1, and in the pod folder the pod picture.

I hope I have explained this clearly.

links of example:
https://drive.google.com/drive/folders/1di8b1L1i1lc2ZKTAWyrNiZNpxMmGcQAH?usp=share_link

tnks!

I'm assuming you have a function to download the correct file, of the form

download_picture <- function(accession, type = c("seed", "pod"), dir)

so that, for example, download_picture("G 1", "pod", "path/dir") will download the picture(s) of the pod into path/dir, thereby creating the file path/dir/G 1 pod.jpg.

In that case, you want to loop on the rows of your data frame, and for each row call a function that creates the directory structure for each row, and download the files, let's call it create_dir_and_download_files().

Then the looping itself can be done using purrr::walk2().

# One row per accession: its ID (some IDs contain padding spaces) and the
# folder it should be grouped under
basic <- data.frame(
  ACCESION = c("G    1", "G    7A", "G35015", "G40897", "G27573"),
  FOLDER = c(
    "P_vulgaris", "P_vulgaris", "P_dumosus", "P_albescens", "P_lunatus"
  )
)


# Functions ----


#' Pretend to download one picture of an accession.
#'
#' Stand-in for the real download: writes a small text file named
#' "<accession> <type>.jpg" into `dir`.
#'
#' @param accession Accession identifier, used as the start of the file name.
#' @param type Which picture to fetch, either "seed" or "pod". When omitted,
#'   the first choice ("seed") is used.
#' @param dir Directory in which the file is written (it must already exist).
download_picture <- function(accession, type = c("seed", "pod"), dir) {
  # Resolve the default c("seed", "pod") to a single validated choice; without
  # this, calling with the default would build two file names at once.
  type <- match.arg(type)

  # pretend we downloaded it
  filename <- paste0(accession, " ", type, ".jpg")
  writeLines(sample(stringr::words, 10), file.path(dir, filename))
}


#' Create the folder layout for one accession and download its pictures.
#'
#' Ensures that data/<folder>/seed and data/<folder>/pod exist, then calls
#' download_picture() once for each picture type.
#'
#' @param accession Accession identifier, passed on to download_picture().
#' @param folder Name of the sub-folder of "data" the pictures are grouped
#'   under.
create_dir_and_download_files <- function(accession, folder) {
  for (type in c("seed", "pod")) {
    type_folder <- file.path("data", folder, type)

    # recursive = TRUE also creates "data" and "data/<folder>" when they are
    # missing; the guard avoids the "already exists" warning when the same
    # folder name shows up for a later accession.
    if (!dir.exists(type_folder)) {
      dir.create(type_folder, recursive = TRUE)
    }

    # now download, reusing the path that was just prepared
    download_picture(accession, type, type_folder)
  }
}


# Main script ----

# Create the base folder only when it is missing, so that re-running the
# script does not raise an "already exists" warning.
if (!dir.exists("data")) {
  dir.create("data")
}

# Call create_dir_and_download_files(accession, folder) once per row of basic
purrr::walk2(basic$ACCESION, basic$FOLDER, create_dir_and_download_files)

# Print the resulting folder hierarchy
fs::dir_tree("data")
#> data
#> ├── P_albescens
#> │   ├── pod
#> │   │   └── G40897 pod.jpg
#> │   └── seed
#> │       └── G40897 seed.jpg
#> ├── P_dumosus
#> │   ├── pod
#> │   │   └── G35015 pod.jpg
#> │   └── seed
#> │       └── G35015 seed.jpg
#> ├── P_lunatus
#> │   ├── pod
#> │   │   └── G27573 pod.jpg
#> │   └── seed
#> │       └── G27573 seed.jpg
#> └── P_vulgaris
#>     ├── pod
#>     │   ├── G    1 pod.jpg
#>     │   └── G    7A pod.jpg
#>     └── seed
#>         ├── G    1 seed.jpg
#>         └── G    7A seed.jpg

Created on 2022-11-09 by the reprex package (v2.0.1)

1 Like

I'm trying to avoid downloading everything again, because that process took 20 hours. I don't know whether there is a way to move these files with R.

The folder are in this form:

You code run very well
image

But the images don't open, because they are newly created files.

Oh I didn't understand the question correctly.

So, if I understand correctly, you have 1 folder for each accession number with two pictures in each, and you want to group them by FOLDER?

I'll use the library {fs} which comes with the tidyverse and is practical for file manipulation.

library(fs)

Let's create example data, these are not actual jpeg files, just text files with a ".jpg" extension.

# Accessions and the folder each one should end up in
basic <- data.frame(
  ACCESION = c("G    1", "G    7A", "G35015", "G40897", "G27573"),
  FOLDER = c(
    "P_vulgaris", "P_vulgaris", "P_dumosus", "P_albescens", "P_lunatus"
  )
)

# Base folder of the example ("before") layout
dir_pre <- "data"
dir_create(dir_pre)

# For every accession, create <dir_pre>/<accession>/ holding two fake pictures,
# "<accession> sed.jpg" and "<accession> pod.jpg" (plain text files)
purrr::walk(basic$ACCESION, \(acc) {
  acc_dir <- path(dir_pre, acc)
  dir_create(acc_dir)

  for (type in c("sed", "pod")) {
    writeLines(
      paste0("This file is for ", acc, " ", type),
      con = paste0(path(acc_dir, acc), " ", type, ".jpg")
    )
  }
})

fs::dir_tree(dir_pre)
#> data
#> ├── G    1
#> │   ├── G    1 pod.jpg
#> │   └── G    1 sed.jpg
#> ├── G    7A
#> │   ├── G    7A pod.jpg
#> │   └── G    7A sed.jpg
#> ├── G27573
#> │   ├── G27573 pod.jpg
#> │   └── G27573 sed.jpg
#> ├── G35015
#> │   ├── G35015 pod.jpg
#> │   └── G35015 sed.jpg
#> └── G40897
#>     ├── G40897 pod.jpg
#>     └── G40897 sed.jpg

Now that should look like the data you currently have. So we will create a new folder hierarchy and move the pictures in there:

# Base folder of the new ("after") layout
dir_post <- "sorted_data"

dir_create(dir_post)

# For every row of basic, move "<acc> sed.jpg" and "<acc> pod.jpg" from
# <dir_pre>/<acc>/ to <dir_post>/<fold>/sed/ and <dir_post>/<fold>/pod/
purrr::walk2(basic$ACCESION, basic$FOLDER,
             \(acc, fold) {
               for (type in c("sed", "pod")) {
                 file_name <- paste0(acc, " ", type, ".jpg")
                 old_path <- path(dir_pre, acc, file_name)
                 new_dir <- path(dir_post, fold, type)

                 # create the folder (missing parents are created too, and an
                 # existing folder is left alone)
                 dir_create(new_dir)

                 # move the file; a missing source is reported and skipped
                 # instead of aborting the whole run half-way, so the script
                 # can safely be run again after fixing the names
                 if (file_exists(old_path)) {
                   file_move(old_path, new_path = path(new_dir, file_name))
                 } else {
                   warning("File not found, skipped: ", old_path,
                           call. = FALSE)
                 }
               }
             })

Now we can check the result, the old folder is empty:

fs::dir_tree(dir_pre)
#> data
#> ├── G    1
#> ├── G    7A
#> ├── G27573
#> ├── G35015
#> └── G40897

And the new folder contains all the images:

fs::dir_tree(dir_post)
#> sorted_data
#> ├── P_albescens
#> │   ├── pod
#> │   │   └── G40897 pod.jpg
#> │   └── sed
#> │       └── G40897 sed.jpg
#> ├── P_dumosus
#> │   ├── pod
#> │   │   └── G35015 pod.jpg
#> │   └── sed
#> │       └── G35015 sed.jpg
#> ├── P_lunatus
#> │   ├── pod
#> │   │   └── G27573 pod.jpg
#> │   └── sed
#> │       └── G27573 sed.jpg
#> └── P_vulgaris
#>     ├── pod
#>     │   ├── G    1 pod.jpg
#>     │   └── G    7A pod.jpg
#>     └── sed
#>         ├── G    1 sed.jpg
#>         └── G    7A sed.jpg

And since my fake images actually contain text, I can check they didn't get mixed up in the process:

readLines("sorted_data/P_dumosus/pod/G35015 pod.jpg")
#> [1] "This file is for G35015 pod"

Created on 2022-11-10 by the reprex package (v2.0.1)

1 Like

I hope I have explained the situation well.
I have a question: in which part do I need to put the path that contains the folders?

# This is the path of folder that contain the other folders.
C:\Users\macosta\Downloads\DescargaBeans_20221102\Descarga_20221102

The folder is this:

image

In which part of your script, @AlexisW, do I need to put this path?

-- is the first time in this topics for me --

In my script, dir_pre and dir_post are variables holding the paths to the base folders. You can see in the tree outputs that the "old folder", dir_pre, was indeed called data, while the new folder, dir_post, was called sorted_data.

So it should work if you ran

library(fs)
# Folder that currently holds one sub-folder per accession
dir_pre <- "C:/Users/macosta/Downloads/DescargaBeans_20221102/Descarga_20221102"
# Folder that will receive the new <FOLDER>/<sed|pod>/ hierarchy
dir_post <- "C:/Users/macosta/Downloads/DescargaBeans_20221102/sorted_Descarga_20221102"

dir_create(dir_post)

# For every row of basic, move "<acc> sed.jpg" and "<acc> pod.jpg" from
# <dir_pre>/<acc>/ to <dir_post>/<fold>/sed/ and <dir_post>/<fold>/pod/
purrr::walk2(basic$ACCESION, basic$FOLDER,
             \(acc, fold) {
               for (type in c("sed", "pod")) {
                 file_name <- paste0(acc, " ", type, ".jpg")
                 old_path <- path(dir_pre, acc, file_name)
                 new_dir <- path(dir_post, fold, type)

                 # create the folder (missing parents are created too, and an
                 # existing folder is left alone)
                 dir_create(new_dir)

                 # move the file; a missing source is reported and skipped
                 # instead of aborting the whole run half-way, so the script
                 # can safely be run again after fixing the names
                 if (file_exists(old_path)) {
                   file_move(old_path, new_path = path(new_dir, file_name))
                 } else {
                   warning("File not found, skipped: ", old_path,
                           call. = FALSE)
                 }
               }
             })

Make sure you test your script on example data before going for the full dataset. If you want to be really safe you can replace file_move() with file_copy(), so the old data doesn't get erased. However this would be slower, and double the amount of storage needed.

2 Likes

Hi @AlexisW , when I run the code show this.

Im try with other folder but is the same error:

Maybe are the spaces in G because for example G 1 has 4 spaces.

I tried with // but it is the same problem.

This creates a sorted_Descarga_20221102 folder with a P_vulgaris folder containing 2 more folders, pod and sed, but these are empty.

To me this suggests a mismatch between the content of basic and the actual files in "Descarga_20221102". More precisely, my guess is that the file "C:/Users/macosta/Downloads/DescargaBeans_20221102/Descarga_20221102/G 1/G 1 sed.jpg" does not exist.

Also something weird, in basic, I think you have 4 spaces in G 1, but in the error message it looks like there is only 1 space in G 1.

The strangest part is that the file exists. I put an example in the Drive link. In addition, the .jpg file name has the same spaces.

Im a little confused :thinking:

Im check the data frame (basic) and has the same organization names.

I'd say best would be to check directly whether R can find the files based only on the information in basic.

What is the result of this?

# Expected location of the "sed" picture of every accession:
#   <dir_pre>/<ACCESION>/<ACCESION> sed.jpg
# Only the accession is needed to build the path, so iterate over that column
# alone.
all_paths <- purrr::map_chr(basic$ACCESION,
                            \(acc) {
                              path(dir_pre, acc, acc) |> paste0(" sed.jpg")
                            })
all_paths

# One TRUE/FALSE per path, in the same order as the rows of basic
file.exists(all_paths)
1 Like

Hi @AlexisW, is the result:

> all_paths
[1] "C:/Users/macosta/Downloads/DescargaBeans_20221102/Descarga_20221102/G    1/G    1 sed.jpg"  
[2] "C:/Users/macosta/Downloads/DescargaBeans_20221102/Descarga_20221102/G    7A/G    7A sed.jpg"
[3] "C:/Users/macosta/Downloads/DescargaBeans_20221102/Descarga_20221102/G35015/G35015 sed.jpg"  
[4] "C:/Users/macosta/Downloads/DescargaBeans_20221102/Descarga_20221102/G40897/G40897 sed.jpg"  
[5] "C:/Users/macosta/Downloads/DescargaBeans_20221102/Descarga_20221102/G27573/G27573 sed.jpg"

For pod:

all_paths
[1] "C:/Users/macosta/Downloads/DescargaBeans_20221102/Descarga_20221102/G    1/G    1 pod.jpg"  
[2] "C:/Users/macosta/Downloads/DescargaBeans_20221102/Descarga_20221102/G    7A/G    7A pod.jpg"
[3] "C:/Users/macosta/Downloads/DescargaBeans_20221102/Descarga_20221102/G35015/G35015 pod.jpg"  
[4] "C:/Users/macosta/Downloads/DescargaBeans_20221102/Descarga_20221102/G40897/G40897 pod.jpg"  
[5] "C:/Users/macosta/Downloads/DescargaBeans_20221102/Descarga_20221102/G27573/G27573 pod.jpg" 

And what is the result of file.exists(all_paths)?

Hi,
for seed is:

file.exists(all_paths)
[1] FALSE FALSE FALSE FALSE FALSE

For pod is:

file.exists(all_paths)
[1]  TRUE FALSE  TRUE  TRUE  TRUE

(oops I had no Internet for the last week)

So that result is pretty clear, the paths for "sed" are not correct, while those for "pod" are (except the second one). So you need to compare the paths generated from basic to the real paths on your computer, and make a corrected version of basic.

Alternatively, if it's too much work and if the paths are consistent, it could be possible to generate basic based on the actual filesystem.

Hi @AlexisW thanks for you interest. Im find the solution. In the next 4 hr Im could put the for you check.
:ok_hand:t4:

Hi @AlexisW, I found this solution. Maybe it isn't the easiest one, but for now it works well for me.

  1. Get the names of directories
# Folder whose sub-directories are listed below
main_dir <- ('C:\\Users\\macosta\\Downloads\\CORE2')
# Full paths of the directories under main_dir, as a one-column data frame.
# NOTE: the column is referenced further down by the text of this call
# (`list.dirs(main_dir, recursive = T)`), so do not reword the call without
# updating that reference as well.
dir_list <- as.data.frame(list.dirs(main_dir,recursive = T))   

# 1     C:\\Users\\macosta\\Downloads\\CORE2\\G   57
# 3     C:\\Users\\macosta\\Downloads\\CORE2\\G   76
# 4     C:\\Users\\macosta\\Downloads\\CORE2\\G   87
# 5     C:\\Users\\macosta\\Downloads\\CORE2\\G   92
# 6     C:\\Users\\macosta\\Downloads\\CORE2\\G  148
# 7     C:\\Users\\macosta\\Downloads\\CORE2\\G  166
  1. Get only the folder name
# Same listing with full.names = FALSE, i.e. only the folder names. The column
# is again referenced later by the text of this call, so keep the two in sync.
dir_list2 <- as.data.frame(list.dirs(main_dir,recursive = T,full.names = FALSE )) 

# 2                                                    G   57
# 3                                                    G   76
# 4                                                    G   87
# 5                                                    G   92
# 6                                                    G  148
# 7                                                    G  166
  1. Merge the names and add to .jpg format (the pictures files has the same names of each folder)
# Build "<folder path>\<folder name> seed.jpg" for every listed directory.
# The backtick-quoted column names are the ones generated by the two
# as.data.frame(list.dirs(...)) calls that created dir_list and dir_list2.
dir_list$NEW <- paste0(dir_list$`list.dirs(main_dir, recursive = T)`,"\\",
                       dir_list2$`list.dirs(main_dir, recursive = T, full.names = FALSE)` ," ", "seed.jpg")

# [2] "C:\\Users\\macosta\\Downloads\\CORE2\\G   57\\G   57 seed.jpg"  
# [3] "C:\\Users\\macosta\\Downloads\\CORE2\\G   76\\G   76 seed.jpg"  
# [4] "C:\\Users\\macosta\\Downloads\\CORE2\\G   87\\G   87 seed.jpg"  
# [5] "C:\\Users\\macosta\\Downloads\\CORE2\\G   92\\G   92 seed.jpg"  
# [6] "C:\\Users\\macosta\\Downloads\\CORE2\\G  148\\G  148 seed.jpg"  
# [7] "C:\\Users\\macosta\\Downloads\\CORE2\\G  166\\G  166 seed.jpg"  
  1. I filtered the names of each ACCESION in each FOLDER, and created each folder manually, and inside each one the two other folders, seed and pod.

image
image

  1. Format the names for use in a function: add ' at the start and end of each path, and a , to separate them.
# Wrap each path in single quotes and append a comma, so that the printed
# values can be pasted as the elements of a c(...) call.
dir_list$NEW<- paste0("'", dir_list$NEW,"'", ",")

# I have problems with \\ and \. But is easy to change with replace tools of Rstudio.
# [2] "'C:\\Users\\macosta\\Downloads\\CORE2\\G   76\\G   76 seed.jpg',"  
# [3] "'C:\\Users\\macosta\\Downloads\\CORE2\\G   87\\G   87 seed.jpg',"  
# [4] "'C:\\Users\\macosta\\Downloads\\CORE2\\G   92\\G   92 seed.jpg',"  
# [5] "'C:\\Users\\macosta\\Downloads\\CORE2\\G  148\\G  148 seed.jpg',"  
# [6] "'C:\\Users\\macosta\\Downloads\\CORE2\\G  166\\G  166 seed.jpg',"  
# [7] "'C:\\Users\\macosta\\Downloads\\CORE2\\G  169\\G  169 seed.jpg'"  #delete the (,) in final object. 
  1. Run this code
# Im use this example 
# https://stackoverflow.com/questions/71601557/moving-and-copying-multiple-files
# Files to copy and the folder each one goes to. All seven pictures share the
# same destination, so it is written once and recycled by data.frame().
df <- data.frame(
  source = c(
    "C:\\Users\\macosta\\Downloads\\CORE2\\G   57\\G   57 seed.jpg",
    "C:\\Users\\macosta\\Downloads\\CORE2\\G   76\\G   76 seed.jpg",
    "C:\\Users\\macosta\\Downloads\\CORE2\\G   87\\G   87 seed.jpg",
    "C:\\Users\\macosta\\Downloads\\CORE2\\G   92\\G   92 seed.jpg",
    "C:\\Users\\macosta\\Downloads\\CORE2\\G  148\\G  148 seed.jpg",
    "C:\\Users\\macosta\\Downloads\\CORE2\\G  166\\G  166 seed.jpg",
    "C:\\Users\\macosta\\Downloads\\CORE2\\G  169\\G  169 seed.jpg"
  ),
  destination = "C:\\Users\\macosta\\Downloads\\CORE_NEW_ORDER\\P_Vulgaris\\seed"
)

# Copy every file into its destination folder under its original file name.
# file.copy() returns one TRUE/FALSE per file, so failed copies show up as
# FALSE in the printed result.
file.copy(from = df$source, to = file.path(df$destination, basename(df$source)))

# destination  is the new folder.
  1. For P_Vulgaris, these are the files in seed.

1 Like

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.