import multiple nested JSON files into R

Dear All,

I have several .json files which I would like to read into R. I have found a helpful approach (since I also have null values) at stackexchange: https://datascience.stackexchange.com/questions/11987/how-to-read-several-json-files-to-a-dataframe-in-r/11989#11989?newreg=4afddfe853a54b23b19f56d4046961f8

That works quite well in itself. However, I have nested json objects.

An exemplary data set:

{"content":"VIDEO","progress":30,"userData":{"id":"BE7C","age":"54","sex":"m"},"answersBefore":{"0_0":1,"0_1":2,"0_2":2,"0_3":1,"0_4":1,"0_5":1,"1_1":2,"1_2":6,"1_0":3,"1_3":1,"1_4":4,"1_5":5,"2_0":2,"2_1":2,"3_0":4,"3_1":6,"3_2":6,"4_0":2,"5_0":7,"5_1":5,"5_2":5,"5_3":7,"5_4":6,"6_0":2,"7_0":7},"answersAfterwards":{"0_0":2,"0_1":1,"0_2":2,"0_3":1,"0_4":2,"0_5":1,"1_0":2,"1_1":1,"1_2":4,"1_3":2,"1_4":3,"1_5":3,"2_0":2,"2_1":2,"3_0":2,"3_1":4,"3_2":4,"4_0":2,"5_0":7,"5_1":5,"5_2":5,"5_3":7,"5_4":6,"6_0":2,"8_0":2,"8_1":2,"8_2":5,"9_0":7},"timeStamps":{"start":"2021-09-08T09:11:03.610Z","firstQuestionnaireFinished":"2021-09-08T09:30:13.814Z","contentFinished":"2021-09-08T09:36:13.377Z","secondQuestionnaireFinished":"2021-09-08T09:38:24.230Z"},"points":0}

For example, the variable "userData" stores values of "id", "age", "sex".

If I use the code as in stackexchange, these "nested variables" are not listed. Instead, only the first value (here the value of id) is stored under userData.

Can anyone tell me what I can change in the code to just store the "nested data" in the dataframe?

Thanks already!

Welcome to the community. As for example dataset (a json), how would you like to have it in dataframe? You like to have it in a single row of a dataframe (like column names would be content, progress, userData.id, userData.age etc.) or a dataframe with multiple rows for your example dataset? Thanks

Instead of using base R for importing JSON files, it is best to use a package called rjson. This will import the JSON file or text as a list, including nested elements. For example, using rjson and tidyverse to obtain the before and after answers in a tibble. Unfortunately, as answers before and after are different lengths, this complicates things and I had to create the tibbles independently then bind them. There will be NA values.

# Install and load the package required to read JSON files.
# install.packages("rjson")
library(rjson)
# Install and load tidyverse
# install.packages(tidyverse)
library(tidyverse)

# Read the JSON text
text='{"content":"VIDEO","progress":30,"userData":{"id":"BE7C","age":"54","sex":"m"},"answersBefore":{"0_0":1,"0_1":2,"0_2":2,"0_3":1,"0_4":1,"0_5":1,"1_1":2,"1_2":6,"1_0":3,"1_3":1,"1_4":4,"1_5":5,"2_0":2,"2_1":2,"3_0":4,"3_1":6,"3_2":6,"4_0":2,"5_0":7,"5_1":5,"5_2":5,"5_3":7,"5_4":6,"6_0":2,"7_0":7},"answersAfterwards":{"0_0":2,"0_1":1,"0_2":2,"0_3":1,"0_4":2,"0_5":1,"1_0":2,"1_1":1,"1_2":4,"1_3":2,"1_4":3,"1_5":3,"2_0":2,"2_1":2,"3_0":2,"3_1":4,"3_2":4,"4_0":2,"5_0":7,"5_1":5,"5_2":5,"5_3":7,"5_4":6,"6_0":2,"8_0":2,"8_1":2,"8_2":5,"9_0":7},"timeStamps":{"start":"2021-09-08T09:11:03.610Z","firstQuestionnaireFinished":"2021-09-08T09:30:13.814Z","contentFinished":"2021-09-08T09:36:13.377Z","secondQuestionnaireFinished":"2021-09-08T09:38:24.230Z"},"points":0}'

json=fromJSON(text)# Can also use the file parameter to read from a file `file="test.json"`

# Access some of the data
json$userData$age
#> [1] "54"
json$content
#> [1] "VIDEO"

# Create the answers before data and convrt it into a tibble
before=as_tibble(
    json$answersBefore
)%>%
    # add the time colum with the value before
    mutate(time="berfore")
before
#> # A tibble: 1 x 26
#>   `0_0` `0_1` `0_2` `0_3` `0_4` `0_5` `1_1` `1_2` `1_0` `1_3` `1_4` `1_5` `2_0`
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     1     2     2     1     1     1     2     6     3     1     4     5     2
#> # ... with 13 more variables: `2_1` <dbl>, `3_0` <dbl>, `3_1` <dbl>,
#> #   `3_2` <dbl>, `4_0` <dbl>, `5_0` <dbl>, `5_1` <dbl>, `5_2` <dbl>,
#> #   `5_3` <dbl>, `5_4` <dbl>, `6_0` <dbl>, `7_0` <dbl>, time <chr>

# Create the answers after data and convrt it into a tibble
after=as_tibble(
    json$answersAfterwards
)%>%
    # add the time colum with the value after
    mutate(time="after")
after
#> # A tibble: 1 x 29
#>   `0_0` `0_1` `0_2` `0_3` `0_4` `0_5` `1_0` `1_1` `1_2` `1_3` `1_4` `1_5` `2_0`
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     2     1     2     1     2     1     2     1     4     2     3     3     2
#> # ... with 16 more variables: `2_1` <dbl>, `3_0` <dbl>, `3_1` <dbl>,
#> #   `3_2` <dbl>, `4_0` <dbl>, `5_0` <dbl>, `5_1` <dbl>, `5_2` <dbl>,
#> #   `5_3` <dbl>, `5_4` <dbl>, `6_0` <dbl>, `8_0` <dbl>, `8_1` <dbl>,
#> #   `8_2` <dbl>, `9_0` <dbl>, time <chr>

# Bind the rows into one tibble
bind_rows(
    before, after
)
#> # A tibble: 2 x 30
#>   `0_0` `0_1` `0_2` `0_3` `0_4` `0_5` `1_1` `1_2` `1_0` `1_3` `1_4` `1_5` `2_0`
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     1     2     2     1     1     1     2     6     3     1     4     5     2
#> 2     2     1     2     1     2     1     1     4     2     2     3     3     2
#> # ... with 17 more variables: `2_1` <dbl>, `3_0` <dbl>, `3_1` <dbl>,
#> #   `3_2` <dbl>, `4_0` <dbl>, `5_0` <dbl>, `5_1` <dbl>, `5_2` <dbl>,
#> #   `5_3` <dbl>, `5_4` <dbl>, `6_0` <dbl>, `7_0` <dbl>, time <chr>,
#> #   `8_0` <dbl>, `8_1` <dbl>, `8_2` <dbl>, `9_0` <dbl>

Created on 2021-10-01 by the reprex package (v0.3.0)

Hi there :slight_smile: Thank you for your reply.

I think for now. I would be happy to have one column for each json object like you described "content, progress, userData.id, userData.age etc." (wide-format). Later on, after I renamed the columns for "answersBefore" and "answersAfter", I would like to be able to transform the data set to a long-format.

In the wide-format, each row consists of answers to a before and after questionnaire. from one participant.
In the long-format, I would like to have two rows per participants, one for the before and one for the after answers since they are basically the same except for some extra questions.

Can you help?

Thank you for your suggestion. I used your code on all json objects and bound the tibbles together using bind_cols as you can see here:


# Create a tibble for every json object in order to unnest data 
content=as_tibble(
   json$content
   )%>%
      # change column name
      rename(condition = value)

progress=as_tibble(
   json$progress
   )%>%
      rename(progress = value)

userdata=as_tibble(json$userData)

before <- as_tibble(json$answersBefore)%>%
   rename(intention1_1 = "0_0",
          intention2_1 = "0_1",
          intention3_1 = "0_2",
          intention4_1 = "0_3",
          intention5_1 = "0_4",
          intention6_1 = "0_5",
          response_efficacy1_1 = "1_0",
          response_efficacy2_1 = "1_1",
          response_efficacy3_1 = "1_2",
          response_efficacy4_1 = "1_3",
          response_efficacy5_1 = "1_4",
          response_efficacy6_1 = "1_5",
          vulnerability_1 = "2_0",
          severity_1 = "2_1",
          gen_self_efficacy1_1 = "3_0",
          gen_self_efficacy2_1 = "3_1",
          gen_self_efficacy3_1 = "3_2",
          negative_affect_1 = "4_0",
          spec_knowledge1_1 = "5_0",
          spec_knowledge2_1 = "5_1",
          spec_knowledge3_1 = "5_2",
          spec_self_efficacy1_1 = "5_3",
          spec_self_efficacy2_1 = "5_4",
          collective_engagement_interest_1 ="6_0",
          technical_affinity ="7_0")

after=as_tibble(json$answersAfterwards)%>%
   rename(intention1_2 = "0_0",
          intention2_2 = "0_1",
          intention3_2 = "0_2",
          intention4_2 = "0_3",
          intention5_2 = "0_4",
          intention6_2 = "0_5",
          response_efficacy1_2 = "1_0",
          response_efficacy2_2 = "1_1",
          response_efficacy3_2 = "1_2",
          response_efficacy4_2 = "1_3",
          response_efficacy5_2 = "1_4",
          response_efficacy6_2 = "1_5",
          vulnerability_2 = "2_0",
          severity_2 = "2_1",
          gen_self_efficacy1_2 = "3_0",
          gen_self_efficacy2_2 = "3_1",
          gen_self_efficacy3_2 = "3_2",
          negative_affect_2 = "4_0",
          spec_knowledge1_2 = "5_0",
          spec_knowledge2_2 = "5_1",
          spec_knowledge3_2 = "5_2",
          spec_self_efficacy1_2 = "5_3",
          spec_self_efficacy2_2 = "5_4",
          collective_engagement_interest_2 ="6_0",
          enjoyment ="8_0",
          mastery = "8_1",
          meaning = "8_2",
          interest = "8_3",
          autonomy = "8_4",
          concentration = "9_0")

points=as_tibble(json$points)%>%
   rename(points = value)

timestamps=as_tibble(json$timeStamps)

bind_cols(content,progress,userdata,before,after, points, timestamps)

which gives me exactly what I want, a single row with all data that was stored in that json-file. It represents the answers given by one person that participated in my survey. So one row equals one participant.

Now I am wondering how I can do that to all my json- files in my in my data-folder (I have 198 more of these files) and successively and row-wise add them to that tibble.

Do you know, how I can do that?

Thank you!