how to subset a dataset properly (incorrect result or bad code?)

mmarion · March 28, 2023, 3:37am

I can't figure out why the number of outliers is always 0 when the number of non-outliers is not zero. No matter how large the dataset df2 is, the end result is always the same. Am I reading this wrong?

I am following an example:

# Create a new data frame that contains only those rows
# whose z-score is below 3.
# Note: there is no abs() here, so rows with large negative z-scores are kept.
# subset() also drops rows where the condition evaluates to NA.
new_data <- subset(data, data$zscore < 3)

My data sets:

dput(head(df2))
structure(list(id = 1:6, id2 = 0:5, z = c(0L, 0L, 0L, 0L, 0L,
0L), x1 = c(7.4, 7.8, 7.8, 11.2, 7.4, 7.4), x2 = c(0.7, 0.88,
0.76, 0.28, 0.7, 0.66), x3 = c(0, 0, 0.04, 0.56, 0, 0), x4 = c(1.9,
2.6, 2.3, 1.9, 1.9, 1.8), x5 = c(0.076, 0.098, 0.092, 0.075,
0.076, 0.075), x6 = c(11, 25, 15, 17, 11, 13), x7 = c(34, 67,
54, 60, 34, 40), x8 = c(0.9978, 0.9968, 0.997, 0.998, 0.9978,
0.9978), x9 = c(3.51, 3.2, 3.26, 3.16, 3.51, 3.51), x10 = c(0.56,
0.68, 0.65, 0.58, 0.56, 0.56), x11 = c(9.4, 9.8, 9.8, 9.8, 9.4,
9.4), y = c(5L, 5L, 5L, 6L, 5L, 5L), y2 = c(0L, 0L, 0L, 1L, 0L,
0L), y3 = c(1L, 1L, 1L, 2L, 1L, 1L)), row.names = c(NA, 6L), class = "data.frame")

dput(head(df3))
structure(list(z = c(-1.75005514316603, -1.75005514316603, -1.75005514316603,
-1.75005514316603, -1.75005514316603, -1.75005514316603), x1 = c(0.142462300205994,
0.451001010798382, 0.451001010798382, 3.07358005083368, 0.142462300205994,
0.142462300205994), x2 = c(2.18866446400268, 3.28198233904062,
2.55310375568199, -0.362410577752516, 2.18866446400268, 1.94570493621647
), x3 = c(-2.19266375510471, -2.19266375510471, -1.91740510037435,
1.6609574111204, -2.19266375510471, -2.19266375510471), x4 = c(-0.744720785192258,
-0.597594077620892, -0.660648380865763, -0.744720785192258, -0.744720785192258,
-0.765738886273882), x5 = c(0.569913952190335, 1.19788250632519,
1.0266183551975, 0.541369927002388, 0.569913952190335, 0.541369927002388
), x6 = c(-1.10005519223097, -0.311296125454904, -0.87469545886638,
-0.762015592184085, -1.10005519223097, -0.987375325548675), x7 = c(-1.44624721020492,
-0.862402248309921, -1.09240177875341, -0.986248149317952, -1.44624721020492,
-1.34009358076947), x8 = c(1.03491316497404, 0.701432322361402,
0.768128490883923, 1.10160933349656, 1.03491316497404, 1.03491316497404
), x9 = c(1.81294997139708, -0.11506417365602, 0.258099854418771,
-0.36384019237255, 1.81294997139708, 1.81294997139708), x10 = c(0.193081910246498,
0.999501691167798, 0.797896745937473, 0.327485207066714, 0.193081910246498,
0.193081910246498), x11 = c(-0.915393708652846, -0.58002349000728,
-0.58002349000728, -0.58002349000728, -0.915393708652846, -0.915393708652846
), y = c(-0.937157483579359, -0.937157483579359, -0.937157483579359,
0.207983041305932, -0.937157483579359, -0.937157483579359)), row.names = c(NA,
6L), class = "data.frame")

I have 11 variables which I am checking one at a time for outliers. I am repeating this type of statement.

# Read the three input files. Only df2 and df3 are used below.
# presumably df3 holds the z-scores of df2's columns, row for row -- TODO confirm
df  <- read.csv("df.csv",header=TRUE)
df2 <- read.csv("df2.csv",header=TRUE)
df3 <- read.csv("df3.csv",header=TRUE)

# Work on a copy of df2 so the data read from disk stays untouched.
data <- df2
names(data)

# Attach df3's x1..x11 to the working copy as x1z..x11z.
# The assignment is positional, so this assumes df2 and df3 contain the same
# rows in the same order -- verify before trusting the result.
data$x1z <- df3$x1
data$x2z <- df3$x2
data$x3z <- df3$x3
data$x4z <- df3$x4
data$x5z <- df3$x5
data$x6z <- df3$x6
data$x7z <- df3$x7
data$x8z <- df3$x8
data$x9z <- df3$x9
data$x10z <- df3$x10
data$x11z <- df3$x11
names(data)
dim(data)

# Keep only rows with no outliers.
# Each subset() call filters the result of the previous call, so the eleven
# conditions are combined with AND: a row is kept only if abs(z) <= 3 for
# every one of x1z..x11z. subset() also drops rows where a condition is NA.
datakeep <- data
datakeep <- subset(datakeep,abs(datakeep$x1z)<= 3)
datakeep <- subset(datakeep,abs(datakeep$x2z)<= 3)
datakeep <- subset(datakeep,abs(datakeep$x3z)<= 3)
datakeep <- subset(datakeep,abs(datakeep$x4z)<= 3)
datakeep <- subset(datakeep,abs(datakeep$x5z)<= 3)
datakeep <- subset(datakeep,abs(datakeep$x6z)<= 3)
datakeep <- subset(datakeep,abs(datakeep$x7z)<= 3)
datakeep <- subset(datakeep,abs(datakeep$x8z)<= 3)
datakeep <- subset(datakeep,abs(datakeep$x9z)<= 3)
datakeep <- subset(datakeep,abs(datakeep$x10z)<= 3)
datakeep <- subset(datakeep,abs(datakeep$x11z)<= 3)

dim(datakeep) # poster reports 6009 x 28, i.e. 938 rows were removed
names(datakeep)

# create a new dataframe that contains only those rows
# that have a z-score > 3


# Read the three input files. Only df2 and df3 are used below.
# presumably df3 holds the z-scores of df2's columns, row for row -- TODO confirm
df  <- read.csv("df.csv",header=TRUE)
df2 <- read.csv("df2.csv",header=TRUE)
df3 <- read.csv("df3.csv",header=TRUE)

# Work on a copy of df2 and attach df3's x1..x11 as x1z..x11z.
# The assignment is positional, so this assumes df2 and df3 contain the same
# rows in the same order -- verify before trusting the result.
data <- df2
names(data)
data$x1z <- df3$x1
data$x2z <- df3$x2
data$x3z <- df3$x3
data$x4z <- df3$x4
data$x5z <- df3$x5
data$x6z <- df3$x6
data$x7z <- df3$x7
data$x8z <- df3$x8
data$x9z <- df3$x9
data$x10z <- df3$x10
data$x11z <- df3$x11
names(data)
dim(data)

# keep only rows with outliers
# NOTE(review): each subset() call below filters the result of the previous
# call, so the eleven conditions are combined with AND. A row survives only
# if abs(z) > 3 for every one of x1z..x11z at the same time, which is why
# the result ends up with 0 rows. This is not the complement of the
# "abs(z) <= 3 for all variables" filter; that complement is "abs(z) > 3 for
# at least one variable", i.e. the conditions joined with | inside a single
# subset() call.
datakeep2 <- data
datakeep2 <- subset(datakeep2,abs(datakeep2$x1z)>3)
datakeep2 <- subset(datakeep2,abs(datakeep2$x2z)>3)
datakeep2 <- subset(datakeep2,abs(datakeep2$x3z)>3)
datakeep2 <- subset(datakeep2,abs(datakeep2$x4z)>3)
datakeep2 <- subset(datakeep2,abs(datakeep2$x5z)>3)
datakeep2 <- subset(datakeep2,abs(datakeep2$x6z)>3)
datakeep2 <- subset(datakeep2,abs(datakeep2$x7z)>3)
datakeep2 <- subset(datakeep2,abs(datakeep2$x8z)>3)
datakeep2 <- subset(datakeep2,abs(datakeep2$x9z)>3)
datakeep2 <- subset(datakeep2,abs(datakeep2$x10z)>3)
datakeep2 <- subset(datakeep2,abs(datakeep2$x11z)>3)

dim(datakeep2) # 0 x 28: no row exceeds the limit on all eleven variables
names(datakeep2)
head(datakeep2,25)

FJCC · March 28, 2023, 5:06am

You are asking for rows where abs(x1z) > 3 AND abs(x2z) > 3 AND abs(x3z) > 3, etc., i.e. rows in which every variable is an outlier at the same time.
If you post the output of

dput(head(df2))

and

dput(head(df3))

someone can probably suggest a good solution to select the rows with outliers.

Leon · March 28, 2023, 7:21am

Does this help?

# Define outliers with Tukey's fence rule: values more than k * IQR below the
# first quartile or above the third quartile. This is the rule behind the
# whiskers of default boxplots (boxplot.stats() uses hinges rather than
# quantile(), so results can differ slightly on small samples).
#
# x     : numeric vector.
# k     : fence multiplier; 1.5 is the conventional default.
# na.rm : drop NAs when computing the quartiles (default TRUE). NA entries of
#         x stay NA in the result, which subset() treats as "not selected".
# Returns a logical vector the same length as x (TRUE = outlier).
is_outlier_q <- function(x, k = 1.5, na.rm = TRUE){
  # Both quartiles come from one call; their difference is exactly IQR(x).
  q <- quantile(x, probs = c(0.25, 0.75), na.rm = na.rm, names = FALSE)
  fence <- k * (q[2] - q[1])
  lower <- q[1] - fence
  upper <- q[2] + fence
  return( x < lower | upper < x )
}

# Define outliers using standard scores: flag values whose z-score exceeds
# z_lim in absolute value.
#
# x     : numeric vector.
# z_lim : absolute z-score above which a value counts as an outlier.
# Returns a logical vector the same length as x (TRUE = outlier). NA entries
# of x stay NA; scale() ignores them when computing the mean and the spread.
is_outlier_z <- function(x, z_lim = 3){
  scaled <- scale(x)
  z <- scaled[,1]
  # With constant or single-value input the scale factor is 0 and every
  # z-score is 0/0 = NaN. No value deviates from the mean in that case, so
  # treat those z-scores as 0 instead of returning an all-NA mask.
  if (isTRUE(unname(attr(scaled, "scaled:scale")) == 0)) {
    z[!is.na(x)] <- 0
  }
  return( abs(z) > z_lim )
}

# Simulate a toy data set: two independent standard-normal columns
my_data <- data.frame(x = rnorm(1000), y = rnorm(1000))

# Pull out the flagged rows with logical row indexing
my_data[is_outlier_q(my_data$x), ]
my_data[is_outlier_q(my_data$x) | is_outlier_q(my_data$y), ]
my_data[is_outlier_z(my_data$x), ]
my_data[is_outlier_z(my_data$x) | is_outlier_z(my_data$y), ]

...and be aware that using the standard score approach may not give you what you think it does. E.g., removing all observations with abs(z) > 3 and then re-scaling the remaining data can produce new observations where abs(z) > 3, because the remaining data now have a smaller variance. I have seen people employ an iterative approach, where they repeat the procedure until all abs(z) < z_lim.

In any case, it is important to think about what counts as an outlier in your context and why you want to remove them.

mmarion · March 28, 2023, 11:47am

Leon:

# Define outliers with Tukey's fence rule: values more than k * IQR below the
# first quartile or above the third quartile. This is the rule behind the
# whiskers of default boxplots (boxplot.stats() uses hinges rather than
# quantile(), so results can differ slightly on small samples).
#
# x     : numeric vector.
# k     : fence multiplier; 1.5 is the conventional default.
# na.rm : drop NAs when computing the quartiles (default TRUE). NA entries of
#         x stay NA in the result, which subset() treats as "not selected".
# Returns a logical vector the same length as x (TRUE = outlier).
is_outlier_q <- function(x, k = 1.5, na.rm = TRUE){
  # Both quartiles come from one call; their difference is exactly IQR(x).
  q <- quantile(x, probs = c(0.25, 0.75), na.rm = na.rm, names = FALSE)
  fence <- k * (q[2] - q[1])
  lower <- q[1] - fence
  upper <- q[2] + fence
  return( x < lower | upper < x )
}

# Define outliers using standard scores: flag values whose z-score exceeds
# z_lim in absolute value.
#
# x     : numeric vector.
# z_lim : absolute z-score above which a value counts as an outlier.
# Returns a logical vector the same length as x (TRUE = outlier). NA entries
# of x stay NA; scale() ignores them when computing the mean and the spread.
is_outlier_z <- function(x, z_lim = 3){
  scaled <- scale(x)
  z <- scaled[,1]
  # With constant or single-value input the scale factor is 0 and every
  # z-score is 0/0 = NaN. No value deviates from the mean in that case, so
  # treat those z-scores as 0 instead of returning an all-NA mask.
  if (isTRUE(unname(attr(scaled, "scaled:scale")) == 0)) {
    z[!is.na(x)] <- 0
  }
  return( abs(z) > z_lim )
}

# Simulate a toy data set: two independent standard-normal columns
my_data <- data.frame(x = rnorm(1000), y = rnorm(1000))

# Pull out the flagged rows with logical row indexing
my_data[is_outlier_q(my_data$x), ]
my_data[is_outlier_q(my_data$x) | is_outlier_q(my_data$y), ]
my_data[is_outlier_z(my_data$x), ]
my_data[is_outlier_z(my_data$x) | is_outlier_z(my_data$y), ]

I like it. I am using set.seed(12345).
You say rescaling is a problem. Can you expand on that? Why would I want to rescale?
How would you recommend I find the nonoutliers? I just changed the return statements.
I am getting different answers as I try different methods. I am studying your comments.
This is more complicated than I thought. Thank you for your reply.

system · May 9, 2023, 11:47am

This topic was automatically closed 42 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.