How can I find the intersecting point between my two histograms?

I am following this wonderful post from Fong Chun Chan's blog - where you have a distribution that is bimodal (two apparent peaks).

The example code (pasted below) fits a Gaussian Mixed Model (GMM) and clarifies the two distributions.

library("mixtools")

#Define function
plot_mix_comps <- function(x, mu, sigma, lam) {
  lam * dnorm(x, mu, sigma)
}

#Example with `faithful` dataset
set.seed(1)
wait <- faithful$waiting
mixmdl <- normalmixEM(wait, k = 2)


#Apply function and plot mixed model
data.frame(x = mixmdl$x) %>%
  ggplot() +
  geom_histogram(aes(x, ..density..), binwidth = 1, colour = "black", 
                 fill = "white") +
  stat_function(geom = "line", fun = plot_mix_comps,
                args = list(mixmdl$mu[1], mixmdl$sigma[1], lam = mixmdl$lambda[1]),
                colour = "red", lwd = 1.5) +
  stat_function(geom = "line", fun = plot_mix_comps,
                args = list(mixmdl$mu[2], mixmdl$sigma[2], lam = mixmdl$lambda[2]),
                colour = "blue", lwd = 1.5) +
  ylab("Density")

The before and aftger look like this:

How can I find the point where these two density plots intersect? I've played around with intersect but it seems like I'm not having any luck and probably overestimated how simple this would be.

Thanks!

Shouldn't be too hard to adapt snippet below, which is used to find, and plot, local maxima. (Says the man who's too lazy to do it himself.)

suppressPackageStartupMessages({
  library(ggplot2)
})
df1 <- data.frame(
  x = c(
    16,
    16.564238781462,
    17.1483754005807,
    17.7531115530855,
    18.3791736799526,
    19.0273138400435,
    19.6983106135187,
    20.3929700371082,
    21.1121265723663,
    21.8566441080703,
    22.6274169979695,
    23.42537113513,
    24.2514650641664,
    25.106691132696,
    25.9920766833995,
    26.9086852881189,
    27.857618025476,
    28.8400148035466,
    29.8570557291778,
    30.9099625255951,
    32,
    33.1284775629241,
    34.2967508011614,
    35.506223106171,
    36.7583473599051,
    38.0546276800871,
    39.3966212270373,
    40.7859400742164,
    42.2242531447326,
    43.7132882161407,
    45.254833995939,
    46.85074227026,
    48.5029301283327,
    50.213382265392,
    51.9841533667991,
    53.8173705762377,
    55.7152360509519,
    57.6800296070931,
    59.7141114583557,
    61.8199250511901,
    64,
    66.2569551258482,
    68.5935016023228,
    71.0124462123421,
    73.5166947198102,
    76.1092553601741,
    78.7932424540747,
    81.5718801484328,
    84.4485062894653,
    87.4265764322813,
    90.5096679918781,
    93.70148454052,
    97.0058602566655,
    100.426764530784,
    103.968306733598,
    107.634741152475,
    111.430472101904,
    115.360059214186,
    119.428222916711,
    123.63985010238,
    128,
    132.513910251696,
    137.187003204646,
    142.024892424684,
    147.03338943962,
    152.218510720348,
    157.586484908149,
    163.143760296866,
    168.897012578931,
    174.853152864563,
    181.019335983756,
    187.40296908104,
    194.011720513331,
    200.853529061568,
    207.936613467196,
    215.269482304951,
    222.860944203808,
    230.720118428373,
    238.856445833423,
    247.279700204761,
    256,
    265.027820503393,
    274.374006409291,
    284.049784849368,
    294.066778879241,
    304.437021440697,
    315.172969816299,
    326.287520593731,
    337.794025157861,
    349.706305729125,
    362.038671967512,
    374.80593816208,
    388.023441026662,
    401.707058123136,
    415.873226934393,
    430.538964609902,
    445.721888407616,
    461.440236856745,
    477.712891666846,
    494.559400409521,
    512,
    530.055641006786,
    548.748012818582,
    568.099569698737,
    588.133557758482,
    608.874042881393,
    630.345939632597,
    652.575041187462,
    675.588050315722,
    699.412611458251
  ),
  y = c(
    0.000509477897425229,
    0.000527386888743664,
    0.000546053806503485,
    0.000563536347150758,
    0.000578694693557497,
    0.000592184934692638,
    0.000606880432664763,
    0.00062740434643328,
    0.000658951780340787,
    0.000706022263900694,
    0.000771695560505854,
    0.000857665701641295,
    0.000964821690647073,
    0.00109410977624778,
    0.00124762020201182,
    0.00142985018853286,
    0.00164871137836403,
    0.00191549532233234,
    0.00224333357803463,
    0.00264484132738257,
    0.00313097018738546,
    0.00371357432026318,
    0.004413116507083,
    0.00527051763162741,
    0.00635951029446201,
    0.00779442118237125,
    0.00972891845147851,
    0.0123436051467146,
    0.0158230846458223,
    0.0203248591277636,
    0.0259430773877464,
    0.0326716119103998,
    0.0403751637620158,
    0.0487822072022065,
    0.0575125639221519,
    0.0661386623990792,
    0.0742571414246156,
    0.0815347852470562,
    0.0877087303679976,
    0.0925592583028204,
    0.0958964121231122,
    0.0975778467891361,
    0.0975299735518198,
    0.0957428778093582,
    0.0922662141274413,
    0.0872674196185096,
    0.0811466237435986,
    0.0745892159235216,
    0.0684282888234455,
    0.0633408656175527,
    0.0595754289702118,
    0.0569149855310437,
    0.0549012076098515,
    0.0531548543217289,
    0.0515892280330073,
    0.0504152091574836,
    0.0499599635705786,
    0.0504031217012216,
    0.0515799825745852,
    0.0529930988557304,
    0.0540733795338386,
    0.0545753160201401,
    0.0548889928851148,
    0.0560788146361983,
    0.0595912907883012,
    0.0667366977351586,
    0.0781688004268401,
    0.0935992678733074,
    0.111863211025667,
    0.131261468491735,
    0.14996894361554,
    0.166304089020949,
    0.178792397549176,
    0.186142519356196,
    0.187370254991816,
    0.182238791895045,
    0.171908697734763,
    0.159376822267072,
    0.149232179773896,
    0.14661852192979,
    0.15582462684655,
    0.17916710287631,
    0.216598366309568,
    0.26603216591513,
    0.324083541154487,
    0.386820517224485,
    0.450113246364986,
    0.509323652377758,
    0.558573294272359,
    0.590463672842883,
    0.597264437175175,
    0.573745920439653,
    0.520380960986617,
    0.444758790690755,
    0.359750166225963,
    0.278987238175495,
    0.21206887381186,
    0.162091637731931,
    0.12651426193784,
    0.100279095709724,
    0.0790468365289463,
    0.0608671439409344,
    0.0459482418843581,
    0.0353311402777704,
    0.0295872989249962,
    0.0282323900120659,
    0.0298985530326881,
    0.032896184060171,
    0.0357555698838089,
    0.0375239368848331
  )
)

p <- ggplot(df1, aes(x, y)) +
  geom_point() +
  theme_minimal()
p

which(diff(sign(diff(df1$y))) == -2) + 1 -> positions
# positions
#[[[[ [1] 42 75 91
indices <- df1[positions, 1]
            p +
              geom_vline(xintercept = indices[1]) +
              geom_vline(xintercept = indices[2]) +
              geom_vline(xintercept = indices[3])

Created on 2021-01-05 by the reprex package (v0.3.0.9001)

1 Like

Dang - that sent me down a crazy rabbit hole.

All in all, it seems like finding local minima will not work (or rather, I can't get it to work). The reason seems to be because the local minima of either the original bimodal distribution or the mixed models is only slightly close to the point of intersect between the two.

Here's my example where I tried to apply this to my data. Plotting the local minima of the mixed models just results in tons of lines, but if I plot the local minima based on the original data, it definitely makes sense (that line is in the right place..........well, right enough). But this still isn't the point of intersect.

There should be some way take the example I started with above and find the intersecting point (says the man who has been stuck on this for too long).

Thus, my caveat about the man who didn't have to do it himself.

I had in mind the single density plot of the first post, which, by inspection had vline around 67. Post a dput of the data frame you're using?

If you know the mean and standard deviation of the two distributions, I think you can simply use the optimize function.

NormIntersect <- function(x, mu1, sig1, mu2, sig2) {
  (dnorm(x, mean = mu1, sd = sig1) - dnorm(x, mean = mu2, sd = sig2))^2
}
Intersect <- optimize(NormIntersect, interval =  c(54, 80), mu1 = 54, sig1 = 5, 
                      mu2 = 80, sig2 = 6)

plot(seq(40, 100, 0.1), dnorm(seq(40, 100, 0.1), 54, 5), type = "l")
lines(seq(40, 100, 0.1), dnorm(seq(40, 100, 0.1), 80, 6), type = "l")
abline(v = Intersect$minimum)

Created on 2021-01-05 by the reprex package (v0.3.0)

Could you illustrate this with mtcars or another dataset?

In the original post, the mixture component with the lower mean is plotted with

  stat_function(geom = "line", fun = plot_mix_comps,
                args = list(mixmdl$mu[1], mixmdl$sigma[1], lam = mixmdl$lambda[1]),
                colour = "red", lwd = 1.5)

where mixmdl has been calculated with the normalmixEM function and plot_mix_comps just returns dnorm weighted by lam. I was thinking that those same elements of mixmdl could be used to calculate the intersection of the two mixing components in the same way as in my toy example, though lam would have to be included in the function parameters.

OK. I was hoping for real data.

I think my main problem is that I don't know how to extract the two separate distributions when using the mixed model example above. I'm trying to figure out how to adapt @FJCC solution to my data (exact same scenario as example in original post - using the GMM

Hah! I cheated. (Now, if I can only figure out how to back into 7.5)

suppressPackageStartupMessages({
  library(dplyr)
  library(ggplot2)
  library(mixtools)
})

#Define function
plot_mix_comps <- function(x, mu, sigma, lam) {
  lam * dnorm(x, mu, sigma)
}

#Example with `faithful` dataset
set.seed(1)
wait <- faithful$waiting
mixmdl <- normalmixEM(wait, k = 2)
#> number of iterations= 29

x <- faithful$waiting
d <- data.frame(x = density(x)$x, y = density(x)$y)
which(diff(sign(diff(d$y))) == 2) + 7.5 -> positions
xline <- d[positions, 1]


data.frame(x = mixmdl$x) %>%
  ggplot() +
  geom_histogram(aes(x, ..density..),
    binwidth = 1, colour = "black",
    fill = "white"
  ) +
  stat_function(
    geom = "line", fun = plot_mix_comps,
    args = list(mixmdl$mu[1], mixmdl$sigma[1], lam = mixmdl$lambda[1]),
    colour = "red", lwd = 1.5
  ) +
  stat_function(
    geom = "line", fun = plot_mix_comps,
    args = list(mixmdl$mu[2], mixmdl$sigma[2], lam = mixmdl$lambda[2]),
    colour = "blue", lwd = 1.5
  ) +
  geom_vline(xintercept = xline) +
  ylab("Density") +
  theme_minimal()

Created on 2021-01-05 by the reprex package (v0.3.0.9001)

Woah, nice! I got this working on the faithful dataset but when I apply it to my data it still places the line where it was before when I was trying out your suggestion of plotting local minima....so weird. I can't understand where the difference is coming from! Here's what I'm doing:

library(readr)
mydat <- read.csv("https://srv-store5.gofile.io/download/tCUnge/mydat.csv") #My data
head(mydat)

numbers
1 0.01770059
2 0.01770059
3 0.01770059
4 0.05142874
5 0.06961057
6 0.06961057

#Define the GMM plotting function
plot_mix_comps <- function(x, mu, sigma, lam) {
  lam * dnorm(x, mu, sigma)
}

#GMM 
mixmdl <- normalmixEM(mydat$numbers, k = 2)

#@technocrat example
x = mydat$numbers
d <- data.frame(x = density(x)$x, y = density(x)$y)
which(diff(sign(diff(d$y))) == 2) + 7.5 -> positions
xline <- d[positions, 1]


#plot
data.frame(x = mixmdl$x) %>%
  ggplot() +
  geom_histogram(aes(x, ..density..),
                 binwidth = 1, colour = "black",
                 fill = "white"
  ) +
  stat_function(
    geom = "line", fun = plot_mix_comps,
    args = list(mixmdl$mu[1], mixmdl$sigma[1], lam = mixmdl$lambda[1]),
    colour = "red", lwd = 1.5
  ) +
  stat_function(
    geom = "line", fun = plot_mix_comps,
    args = list(mixmdl$mu[2], mixmdl$sigma[2], lam = mixmdl$lambda[2]),
    colour = "blue", lwd = 1.5
  ) +
  geom_vline(xintercept = xline) +
  ylab("Density") +
  theme_minimal()

I get this:

Like I said, I cheated. 7.5 is a magic number, based on trial and error. If it's just a matter of making it look right, try to find the right one for your case by increasing to 9, falling back to 8, etc.

I'm not sure whether this will work with your data but for the faithful dataset, it works well.
Basically, I make a plot and using ggplot_build function extract the coordinates used to draw the lines. Then I find all intersections as @technocrat suggested using diff(sign(diff(x)))==2. It returns not the actual intersection but close to it.

library(mixtools)
library(ggplot2)

#Define the GMM plotting function
plot_mix_comps <- function(x, mu, sigma, lam) {
  lam * dnorm(x, mu, sigma)
}

#GMM 
mixmdl <- normalmixEM(faithful$waiting, k = 2)


#save mixmdl$x as data.frame
mixmdl.df = data.frame(x = mixmdl$x)

#save a plot
the.plot = ggplot(mixmdl.df) +
  geom_histogram(aes(x, ..density..),
                 binwidth = 1, colour = "black",
                 fill = "white",
                 alpha = 0.1
  ) +
  stat_function(
    geom = "line", fun = plot_mix_comps,
    args = list(mixmdl$mu[1], mixmdl$sigma[1], lam = mixmdl$lambda[1]),
    colour = "red", lwd = 1.5
  ) +
  stat_function(
    geom = "line", fun = plot_mix_comps,
    args = list(mixmdl$mu[2], mixmdl$sigma[2], lam = mixmdl$lambda[2]),
    colour = "blue", lwd = 1.5
  ) +
  ylab("Density") +
  theme_minimal()

#extract coordinates of both lines from plot
line.df = data.frame(x = ggplot_build(the.plot)$data[[2]]$x,
                     red = ggplot_build(the.plot)$data[[2]]$y,
                     blue = ggplot_build(the.plot)$data[[3]]$y)

#find the minimal distance between lines along y axis
line.df$delta = line.df$red - line.df$blue

#find x coordinate for the minimal delta y
x_coord = line.df$x[which(diff(sign(diff((abs(line.df$delta))))) == 2)+1]

#plot
ggplot(mixmdl.df) +
  geom_histogram(aes(x, ..density..),
                 binwidth = 1, colour = "black",
                 fill = "white",
                 alpha = 0.1
  ) +
  stat_function(
    geom = "line", fun = plot_mix_comps,
    args = list(mixmdl$mu[1], mixmdl$sigma[1], lam = mixmdl$lambda[1]),
    colour = "red", lwd = 1.5
  ) +
  stat_function(
    geom = "line", fun = plot_mix_comps,
    args = list(mixmdl$mu[2], mixmdl$sigma[2], lam = mixmdl$lambda[2]),
    colour = "blue", lwd = 1.5
  ) +
  geom_vline(xintercept = x_coord) +
  ylab("Density") +
  theme_minimal()

Rplot