I have flight data in which each row is a trip, i.e. an ordered sequence of flights.
library(data.table)
library(stringr)
library(dplyr)
library(tidyr)
### Sample dataset ###
# Thirty distinct sample trips; each string lists the flights of one trip in
# order, every flight written as a "[Flight-n]" tag.
sample_trips <- c(
  "[Flight-6][Flight-2][Flight-5][Flight-7]",
  "[Flight-4][Flight-9][Flight-5][Flight-3]",
  "[Flight-8][Flight-3][Flight-2][Flight-4]",
  "[Flight-4][Flight-7][Flight-5][Flight-2]",
  "[Flight-3][Flight-10][Flight-7][Flight-5]",
  "[Flight-9][Flight-10][Flight-1][Flight-8]",
  "[Flight-10][Flight-5][Flight-7][Flight-8]",
  "[Flight-5][Flight-9][Flight-2][Flight-10]",
  "[Flight-1][Flight-10][Flight-9][Flight-7]",
  "[Flight-8][Flight-9][Flight-3][Flight-2]",
  "[Flight-10][Flight-3][Flight-5][Flight-6]",
  "[Flight-3][Flight-7][Flight-5][Flight-10]",
  "[Flight-4][Flight-2][Flight-3][Flight-8]",
  "[Flight-4][Flight-3][Flight-7][Flight-9]",
  "[Flight-2][Flight-4][Flight-7][Flight-1]",
  "[Flight-2][Flight-3][Flight-10][Flight-9]",
  "[Flight-4][Flight-7][Flight-2][Flight-5]",
  "[Flight-10][Flight-4][Flight-5][Flight-6]",
  "[Flight-10][Flight-8][Flight-4][Flight-9]",
  "[Flight-4][Flight-6][Flight-9][Flight-5]",
  "[Flight-2][Flight-5][Flight-4][Flight-10]",
  "[Flight-2][Flight-4][Flight-5][Flight-6]",
  "[Flight-8][Flight-7][Flight-10][Flight-9]",
  "[Flight-1][Flight-2][Flight-6][Flight-5]",
  "[Flight-1][Flight-3][Flight-6][Flight-9]",
  "[Flight-3][Flight-5][Flight-7][Flight-9]",
  "[Flight-5][Flight-8][Flight-4][Flight-10]",
  "[Flight-2][Flight-4][Flight-8][Flight-6]",
  "[Flight-1][Flight-5][Flight-8][Flight-4]",
  "[Flight-7][Flight-10][Flight-3][Flight-1]"
)
# The sample consists of the same 30 trips listed twice (60 rows in total).
sample_connections <- rep(sample_trips, times = 2)
# Trip ids are derived from the number of rows: a fixed 1:30 next to 60
# strings would be silently recycled by data.frame(), giving every id twice.
# stringsAsFactors = FALSE keeps Connections a character column on R < 4.0.
dataset <- data.frame(
  Trips = seq_along(sample_connections),
  Connections = sample_connections,
  stringsAsFactors = FALSE
) # Actual input is data.table from txt file with >2900000 trips
First, I extracted all the connections (pairs of consecutive flights), i.e.:
# for trip [Flight-6][Flight-2][Flight-5][Flight-7] connections are:- [Flight-6][Flight-2] ; [Flight-2][Flight-5] ; [Flight-5][Flight-7]
### Making connections from dataset ###
# A connection is a pair of consecutive flight tags inside one trip.
# The pair is captured inside a zero-width lookahead so that overlapping
# pairs are all reported; str_match_all() returns one matrix per trip with
# the (empty) full match in column 1 and the captured pair in column 2.
connections <- str_match_all(
  dataset$Connections,
  pattern = "(?=(\\[Flight-\\d+\\]\\[Flight-\\d+\\]))"
)
# Take only the capture column. Unlisting the whole matrices would also pull
# in one empty string per pair, which doubles the vector and then has to be
# filtered out again.
conn <- unlist(lapply(connections, function(m) m[, 2]), use.names = FALSE)
# Drop NA (produced by missing trip strings) and de-duplicate.
conn <- unique(conn[!is.na(conn)])
# One row per unique connection, kept as character rather than factor.
conn <- data.frame(conn = conn, stringsAsFactors = FALSE) # All unique connections are created
Now I need to count how many times each connection occurs in the dataset:
### Counting each connection from dataset
# Every pair occurrence was already extracted into `connections` (one match
# matrix per trip, captured pair in column 2), so the counts can be obtained
# by tabulating those pairs in a single pass. Running one regex per
# connection against a concatenated copy of the data would rescan the whole
# data set once for every connection, and non-overlapping matching would
# under-count a pair that overlaps itself (e.g. [Flight-3][Flight-3][Flight-3]).
all_pairs <- unlist(lapply(connections, function(m) m[, 2]), use.names = FALSE)
# ans[i] is the number of occurrences of conn$conn[i]; tabulate() ignores the
# NA positions that match() yields for pairs not listed in conn.
ans <- tabulate(
  match(all_pairs, as.character(conn$conn)),
  nbins = nrow(conn)
)
However, I need to do the same for >17000 connections over >2900000 trips, which is a very slow process.
Furthermore, I also need to calculate 4 other types of counts:
1. Number of trips starting with Flight-x
# Matches only the first flight tag of a trip string: `^` anchors the match
# at the start, and nothing after the closing bracket is consumed.
# Extracting this from every trip (e.g. with str_extract()) and tabulating
# the result gives the number of trips starting with each flight.
pattern <- "^\\[Flight-\\d+\\]"
2. Number of trips ending with Flight-x
# `$` anchors the match at the end of the string, so this matches only the
# final [Flight-n] tag of a trip string.
pattern <- "\\[Flight-\\d+\\]$"
3. Connections starting with Flight-x but not ending at Flight-y
# A flight tag followed by a negative lookahead for another flight tag: it
# matches a tag only when no further tag comes directly after it. On a trip
# string made of back-to-back tags that is the last flight of the trip.
# NOTE(review): for a specific x / y pair the concrete flight numbers
# presumably have to replace \d+ - confirm the intended meaning.
pattern <- "\\[Flight-\\d+\\](?!\\[Flight-\\d+\\])"
4. Connections not starting with Flight-x but ending at Flight-y
# A flight tag that is NOT directly preceded by another flight tag.
# The "not preceded" test must be a lookbehind: a negative lookahead for a
# flight tag placed right in front of the tag itself contradicts the tag and
# can never match. The lookbehind checks only the closing bracket of the
# previous tag, which keeps it fixed-width as regex engines require.
# NOTE(review): with the generic \d+ this selects the first flight of each
# trip string; for a specific x / y pair the concrete flight numbers
# presumably have to be substituted - confirm the intended meaning.
pattern <- "(?<!\\])\\[Flight-\\d+\\]"
where, x and y are all possible combinations.
How can I calculate all 5 counts as fast as possible?
A computation time of under 60 minutes is acceptable.
If it runs in under 15 minutes, I'll be grateful.
Any help is appreciated.
Thank You for your time and concern.