Tokenize a vector of strings into a dataframe

textmining
tidytext

#1

Hi,

I have a list of words that I wish to tokenize using the tokenizers package.
I then want to get the result as a vector of characters, which I convert to a single comma-separated string so that I can bind the column to my original data frame.
Below I have it working for a single item in my data frame, but I am not sure how to get it to work for the other items.

Can anyone help?

library(tokenizers)
library(stringi)
library(tidyverse)
library(glue)
#> 
#> Attaching package: 'glue'
#> The following object is masked from 'package:dplyr':
#> 
#>     collapse

# Taken from https://stackoverflow.com/questions/42734547/generating-random-strings
# Build 5 random ID-like strings (5 uppercase letters + 4 digits + 1 uppercase
# letter each). as_tibble() on a character vector yields a one-column tibble
# whose column is named "value".
random_letters <- sprintf("%s%s%s", stri_rand_strings(5, 5, '[A-Z]'),
                          stri_rand_strings(5, 4, '[0-9]'), stri_rand_strings(5, 1, '[A-Z]')) %>% 
  as_tibble()

# Get the tokenizing to work for one 
one_item_example <- random_letters$value[1]
# tokenize_character_shingles() returns a list with one element per input
# string; unlist() flattens the single element into a character vector of
# 4-character shingles (n = n_min = 4 gives fixed-size shingles).
test_col <- tokenize_character_shingles(one_item_example, n=4, n_min=4) %>% 
  unlist()
# Wrap each shingle in single quotes and join them into one comma-separated
# string.
result <- glue_collapse(glue("'{test_col}'"), sep = ',')


result
#> 'cnvk','nvkz','vkz9','kz97','z978','9784','784x'


# Tokenize - BROKEN
# This fails because the tokenizer returns a list of unnamed character
# vectors, and bind_rows() requires data frames or named lists/vectors.
tokenize_character_shingles(random_letters$value, n=4, n_min=4) %>% 
  bind_rows()
#> Error in bind_rows_(x, .id): Argument 1 must have names

Created on 2019-01-11 by the reprex package (v0.2.1)


#2

Please check your reprex; it seems like you have forgotten to load library(glue).


#3

Hi @andresrcs

Thanks for spotting the mistake. It was very late when I was putting it together. It should reflect my problem now.

All the best


#4

I have managed to get what I want by wrapping it in a function.
The answer is below in case it might help anyone in the future.

library(tokenizers)
library(stringi)
library(tidyverse)
library(glue)
#> 
#> Attaching package: 'glue'
#> The following object is masked from 'package:dplyr':
#> 
#>     collapse

# Generate 5 random strings (5 uppercase letters + 4 digits + 1 uppercase
# letter each) as a one-column tibble; as_tibble() names the column "value".
random_letters <- sprintf("%s%s%s", stri_rand_strings(5, 5, '[A-Z]'),
                          stri_rand_strings(5, 4, '[0-9]'), stri_rand_strings(5, 1, '[A-Z]')) %>% 
  as_tibble()

# Get the tokenizing to work for one 
one_item_example <- random_letters$value[1]

# Tokenize a single string into character shingles and collapse them into one
# comma-separated, single-quoted string, e.g. "'abcd','bcde',...".
#
# mycol  - a character scalar to tokenize.
# n      - maximum shingle size (default 4, matching the original behaviour).
# n_min  - minimum shingle size (default 4, so fixed-size shingles by default).
#
# Returns a length-1 glue/character value suitable for use in an OR / IN
# clause.
generate_or_statement <- function(mycol, n = 4, n_min = 4) {

  # tokenize_character_shingles() returns a list (one element per input
  # string); unlist() flattens it into a plain character vector of shingles.
  shingles <- tokenize_character_shingles(mycol, n = n, n_min = n_min) %>%
    unlist()

  # Wrap each shingle in single quotes, then join with commas.
  glue_collapse(glue("'{shingles}'"), sep = ',')
}

# works for one item
result <- generate_or_statement(one_item_example)
result
#> 'qypx','ypxo','pxo7','xo71','o718','7186','186k'

# Work for all items
# map_chr() applies the helper to each element of the column and returns a
# character vector: one collapsed, quoted shingle string per input value.
result <- map_chr(random_letters$value, generate_or_statement)
result
#> [1] "'qypx','ypxo','pxo7','xo71','o718','7186','186k'"
#> [2] "'aqoa','qoae','oae8','ae89','e899','8993','993c'"
#> [3] "'javg','avgc','vgc2','gc20','c201','2019','019h'"
#> [4] "'fptl','ptlf','tlf2','lf29','f295','2951','951p'"
#> [5] "'sgjo','gjop','jop9','op97','p971','9715','715n'"

closed #5

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.