How to fit two separate curves through points?

Hello there RStudio Community,

In the following example, you can clearly see that there are two separate groups within all the points. How can I fit two curves through them?

Thank you!

library(ggplot2)

DF <- structure(list(X = c(28929, 28058, 26889, 22683, 17083, 11682,
                          8389, 6937, 8657, 12391, 10983, 8167, 5446, 2361, 313, 4833,
                          10012, 5919, 1343, 112, 13061, 23623, 13712, 4221, 670, 8843,
                          29054, 23020, 18785, 20067, 26936, 21327, 16158, 10539, 5303,
                          1436, 26348, 27128, 27479, 20633, 12875, 6868, 2324, 6169, 21883,
                          19960, 20053, 16112, 13644, 12150, 10765, 9452, 4851, 1072, 5061,
                          28810, 28107, 17514, 9705, 5933, 3413, 1508, 345, 143, 1010,
                          1416, 756, 27348, 16801, 10598, 7296, 16074, 22152, 17580, 14459,
                          13382, 16454, 23690, 28282, 29818, 25846, 21218, 18697, 14273,
                          11097, 7367, 3956, 1277, 20213, 24440, 11512, 2374, 7557, 29644,
                          18044, 10996, 5940, 1049, 6214, 15040, 21860, 22515, 10279, 1821,
                          12585, 22151, 10944, 1949, 113, 1276, 2715, 1084, 27205, 15449,
                          6853, 1345, 14355, 3456, 12876, 22207, 24569, 16455, 8843, 1783,
                          3003, 11178, 16928, 19507, 17866, 14252, 8653, 2575, 27272, 28168,
                          24155, 21661, 19386, 17905, 17303, 16945, 14177, 11176, 8874,
                          11342, 29516, 29101, 29500, 29912, 25435, 22122, 19069, 17764,
                          20793, 23853, 23790, 19065, 11404, 5819, 1355, 5465, 2129, 9099,
                          7166, 2720, 529, 1454, 963, 25554, 29823, 23449, 15664, 7408,
                          578, 95, 798, 11649, 26701, 22758, 12340, 2399, 8439, 17052,
                          4638, 266, 725, 3201, 1118, 15934, 27251, 23082, 6152, 27251,
                          20682, 17063, 16983, 17958, 29076, 20145, 12972, 8942, 15562,
                          25763, 18354, 7605, 1039, 6949, 14581, 10381, 6186, 3916, 8119,
                          17504, 24818, 27305, 29436, 24894, 17502, 8942, 16309, 8063,
                          29939, 24013, 5541, 29343, 22546, 15012, 7778, 1865, 3550, 16354,
                          18747, 18045, 13137, 8117, 8235, 14176, 17060, 20161, 24604,
                          26584, 25395, 22419, 22404, 25426, 26008, 24484, 22386, 20630,
                          17129, 12151, 5923, 603), Y = c(35, 35, 34.9, 34.8, 34.7, 34.5,
                          34.3, 34.3, 34.3, 34.5, 34.5, 34.3, 34.2, 34, 33.5, 33.5, 34.4,
                          34.2, 33.7, 33.6, 34.3, 34.9, 34.6, 34.1, 33.7, 33.5, 35, 34.8,
                          34.7, 34.7, 34.9, 34.8, 34.6, 34.4, 34.2, 33.8, 34.9, 34.9, 35,
                          34.8, 34.5, 34.3, 33.9, 33.5, 34.8, 34.8, 34.8, 34.6, 34.6, 34.5,
                          34.4, 34.4, 34.1, 33.8, 33.7, 35, 35, 34.7, 34.4, 34.2, 34.1,
                          33.9, 33.6, 33.6, 33.8, 33.9, 33.7, 34.9, 34.7, 34.4, 34.3, 34.6,
                          34.8, 34.7, 34.6, 34.5, 34.6, 34.8, 35, 35, 34.9, 34.8, 34.7,
                          34.6, 34.5, 34.3, 34.1, 33.9, 34.4, 34.9, 34.5, 34, 33.7, 35,
                          34.7, 34.5, 34.2, 33.7, 33.7, 34.6, 34.8, 34.8, 34.4, 33.7, 34.2,
                          34.8, 34.5, 33.8, 33.6, 33.7, 34, 33.8, 34.9, 34.6, 34.3, 33.8,
                          34.6, 34, 34.6, 34.9, 34.9, 34.7, 34.4, 33.9, 33.9, 34.4, 34.7,
                          34.7, 34.7, 34.6, 34.4, 34, 34.9, 35, 34.9, 34.8, 34.7, 34.7,
                          34.7, 34.7, 34.6, 34.5, 34.4, 34.4, 35, 35, 35, 35, 34.9, 34.8,
                          34.7, 34.7, 34.8, 34.9, 34.9, 34.7, 34.5, 34.2, 33.8, 34.1, 33.7,
                          34.4, 34.3, 34, 33.7, 33.9, 33.8, 34.9, 35, 34.9, 34.6, 34.3,
                          33.5, 33.5, 33.8, 34.4, 34.9, 34.8, 34.5, 33.7, 33.8, 34.7, 34.1,
                          33.6, 33.7, 34, 33.7, 32.9, 33.5, 33.4, 32.8, 33.5, 33.3, 33.2,
                          33.2, 33.2, 33.5, 33.3, 33.1, 32.9, 33.2, 33.4, 33.3, 32.9, 32.3,
                          32.6, 33.1, 33, 32.8, 32.6, 32.9, 33.2, 33.4, 33.5, 33.5, 33.4,
                          33.2, 32.9, 33.2, 32.9, 33.3, 33.4, 32.7, 33.5, 33.4, 33.2, 32.9,
                          32.5, 32.5, 33.2, 33.3, 33.2, 33.1, 32.9, 32.9, 33.1, 33.2, 33.3,
                          33.4, 33.5, 33.4, 33.4, 33.4, 33.4, 33.5, 33.4, 33.4, 33.3, 33.2,
                          33.1, 32.8, 32.2)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
                                                       -252L))

plot1 <- ggplot(DF, aes(x = X, y = Y)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm") +
  scale_x_continuous(labels = scales::comma)
plot1

You can group the points by eye in this case:

ggplot(DF, aes(x = X, y = Y, 
               colour=ifelse((Y >= 33.5 & X < 15000) | Y > 33.5 , "high", "low"))) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm") +
  scale_x_continuous(labels = scales::comma) +
  labs(colour="")

Rplot06

4 Likes

You could first do a clustering of the points using kmeans - in this case using simply Y as a feature and knowing that you have k=2 clusters should be straightforward but the clustering is poor in fact.

ADD ON: Again, using eyeballing, you can create a feature like DF$f <– (33 + (1/25000) * DF$X) < DF$Y and cluster on that - works quite well, but it is not really an "unsupervised" way of learning since we have basically found the boundary by eye.

library(ggplot2)

DF <- structure(list(X = c(28929, 28058, 26889, 22683, 17083, 11682,
													 8389, 6937, 8657, 12391, 10983, 8167, 5446, 2361, 313, 4833,
													 10012, 5919, 1343, 112, 13061, 23623, 13712, 4221, 670, 8843,
													 29054, 23020, 18785, 20067, 26936, 21327, 16158, 10539, 5303,
													 1436, 26348, 27128, 27479, 20633, 12875, 6868, 2324, 6169, 21883,
													 19960, 20053, 16112, 13644, 12150, 10765, 9452, 4851, 1072, 5061,
													 28810, 28107, 17514, 9705, 5933, 3413, 1508, 345, 143, 1010,
													 1416, 756, 27348, 16801, 10598, 7296, 16074, 22152, 17580, 14459,
													 13382, 16454, 23690, 28282, 29818, 25846, 21218, 18697, 14273,
													 11097, 7367, 3956, 1277, 20213, 24440, 11512, 2374, 7557, 29644,
													 18044, 10996, 5940, 1049, 6214, 15040, 21860, 22515, 10279, 1821,
													 12585, 22151, 10944, 1949, 113, 1276, 2715, 1084, 27205, 15449,
													 6853, 1345, 14355, 3456, 12876, 22207, 24569, 16455, 8843, 1783,
													 3003, 11178, 16928, 19507, 17866, 14252, 8653, 2575, 27272, 28168,
													 24155, 21661, 19386, 17905, 17303, 16945, 14177, 11176, 8874,
													 11342, 29516, 29101, 29500, 29912, 25435, 22122, 19069, 17764,
													 20793, 23853, 23790, 19065, 11404, 5819, 1355, 5465, 2129, 9099,
													 7166, 2720, 529, 1454, 963, 25554, 29823, 23449, 15664, 7408,
													 578, 95, 798, 11649, 26701, 22758, 12340, 2399, 8439, 17052,
													 4638, 266, 725, 3201, 1118, 15934, 27251, 23082, 6152, 27251,
													 20682, 17063, 16983, 17958, 29076, 20145, 12972, 8942, 15562,
													 25763, 18354, 7605, 1039, 6949, 14581, 10381, 6186, 3916, 8119,
													 17504, 24818, 27305, 29436, 24894, 17502, 8942, 16309, 8063,
													 29939, 24013, 5541, 29343, 22546, 15012, 7778, 1865, 3550, 16354,
													 18747, 18045, 13137, 8117, 8235, 14176, 17060, 20161, 24604,
													 26584, 25395, 22419, 22404, 25426, 26008, 24484, 22386, 20630,
													 17129, 12151, 5923, 603), Y = c(35, 35, 34.9, 34.8, 34.7, 34.5,
													 																34.3, 34.3, 34.3, 34.5, 34.5, 34.3, 34.2, 34, 33.5, 33.5, 34.4,
													 																34.2, 33.7, 33.6, 34.3, 34.9, 34.6, 34.1, 33.7, 33.5, 35, 34.8,
													 																34.7, 34.7, 34.9, 34.8, 34.6, 34.4, 34.2, 33.8, 34.9, 34.9, 35,
													 																34.8, 34.5, 34.3, 33.9, 33.5, 34.8, 34.8, 34.8, 34.6, 34.6, 34.5,
													 																34.4, 34.4, 34.1, 33.8, 33.7, 35, 35, 34.7, 34.4, 34.2, 34.1,
													 																33.9, 33.6, 33.6, 33.8, 33.9, 33.7, 34.9, 34.7, 34.4, 34.3, 34.6,
													 																34.8, 34.7, 34.6, 34.5, 34.6, 34.8, 35, 35, 34.9, 34.8, 34.7,
													 																34.6, 34.5, 34.3, 34.1, 33.9, 34.4, 34.9, 34.5, 34, 33.7, 35,
													 																34.7, 34.5, 34.2, 33.7, 33.7, 34.6, 34.8, 34.8, 34.4, 33.7, 34.2,
													 																34.8, 34.5, 33.8, 33.6, 33.7, 34, 33.8, 34.9, 34.6, 34.3, 33.8,
													 																34.6, 34, 34.6, 34.9, 34.9, 34.7, 34.4, 33.9, 33.9, 34.4, 34.7,
													 																34.7, 34.7, 34.6, 34.4, 34, 34.9, 35, 34.9, 34.8, 34.7, 34.7,
													 																34.7, 34.7, 34.6, 34.5, 34.4, 34.4, 35, 35, 35, 35, 34.9, 34.8,
													 																34.7, 34.7, 34.8, 34.9, 34.9, 34.7, 34.5, 34.2, 33.8, 34.1, 33.7,
													 																34.4, 34.3, 34, 33.7, 33.9, 33.8, 34.9, 35, 34.9, 34.6, 34.3,
													 																33.5, 33.5, 33.8, 34.4, 34.9, 34.8, 34.5, 33.7, 33.8, 34.7, 34.1,
													 																33.6, 33.7, 34, 33.7, 32.9, 33.5, 33.4, 32.8, 33.5, 33.3, 33.2,
													 																33.2, 33.2, 33.5, 33.3, 33.1, 32.9, 33.2, 33.4, 33.3, 32.9, 32.3,
													 																32.6, 33.1, 33, 32.8, 32.6, 32.9, 33.2, 33.4, 33.5, 33.5, 33.4,
													 																33.2, 32.9, 33.2, 32.9, 33.3, 33.4, 32.7, 33.5, 33.4, 33.2, 32.9,
													 																32.5, 32.5, 33.2, 33.3, 33.2, 33.1, 32.9, 32.9, 33.1, 33.2, 33.3,
													 																33.4, 33.5, 33.4, 33.4, 33.4, 33.4, 33.5, 33.4, 33.4, 33.3, 33.2,
													 																33.1, 32.8, 32.2)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
													 																																																						-252L)) 

km <- DF %>% 
	select(Y) %>% 
	kmeans(centers = 2)

DF$cluster <- as.factor(km$cluster)

DF %>% 
	ggplot(aes(x = X, y = Y, colour=cluster)) +
	geom_point(alpha = 0.6) +
	geom_smooth(method = "lm") +
	scale_x_continuous(labels = scales::comma) +
	labs(colour="")

But since @joels has found the cluster regions by eye, we can cheat and get almost as good results:

library(ggplot2)

DF <- structure(list(X = c(28929, 28058, 26889, 22683, 17083, 11682,
													 8389, 6937, 8657, 12391, 10983, 8167, 5446, 2361, 313, 4833,
													 10012, 5919, 1343, 112, 13061, 23623, 13712, 4221, 670, 8843,
													 29054, 23020, 18785, 20067, 26936, 21327, 16158, 10539, 5303,
													 1436, 26348, 27128, 27479, 20633, 12875, 6868, 2324, 6169, 21883,
													 19960, 20053, 16112, 13644, 12150, 10765, 9452, 4851, 1072, 5061,
													 28810, 28107, 17514, 9705, 5933, 3413, 1508, 345, 143, 1010,
													 1416, 756, 27348, 16801, 10598, 7296, 16074, 22152, 17580, 14459,
													 13382, 16454, 23690, 28282, 29818, 25846, 21218, 18697, 14273,
													 11097, 7367, 3956, 1277, 20213, 24440, 11512, 2374, 7557, 29644,
													 18044, 10996, 5940, 1049, 6214, 15040, 21860, 22515, 10279, 1821,
													 12585, 22151, 10944, 1949, 113, 1276, 2715, 1084, 27205, 15449,
													 6853, 1345, 14355, 3456, 12876, 22207, 24569, 16455, 8843, 1783,
													 3003, 11178, 16928, 19507, 17866, 14252, 8653, 2575, 27272, 28168,
													 24155, 21661, 19386, 17905, 17303, 16945, 14177, 11176, 8874,
													 11342, 29516, 29101, 29500, 29912, 25435, 22122, 19069, 17764,
													 20793, 23853, 23790, 19065, 11404, 5819, 1355, 5465, 2129, 9099,
													 7166, 2720, 529, 1454, 963, 25554, 29823, 23449, 15664, 7408,
													 578, 95, 798, 11649, 26701, 22758, 12340, 2399, 8439, 17052,
													 4638, 266, 725, 3201, 1118, 15934, 27251, 23082, 6152, 27251,
													 20682, 17063, 16983, 17958, 29076, 20145, 12972, 8942, 15562,
													 25763, 18354, 7605, 1039, 6949, 14581, 10381, 6186, 3916, 8119,
													 17504, 24818, 27305, 29436, 24894, 17502, 8942, 16309, 8063,
													 29939, 24013, 5541, 29343, 22546, 15012, 7778, 1865, 3550, 16354,
													 18747, 18045, 13137, 8117, 8235, 14176, 17060, 20161, 24604,
													 26584, 25395, 22419, 22404, 25426, 26008, 24484, 22386, 20630,
													 17129, 12151, 5923, 603), Y = c(35, 35, 34.9, 34.8, 34.7, 34.5,
													 																34.3, 34.3, 34.3, 34.5, 34.5, 34.3, 34.2, 34, 33.5, 33.5, 34.4,
													 																34.2, 33.7, 33.6, 34.3, 34.9, 34.6, 34.1, 33.7, 33.5, 35, 34.8,
													 																34.7, 34.7, 34.9, 34.8, 34.6, 34.4, 34.2, 33.8, 34.9, 34.9, 35,
													 																34.8, 34.5, 34.3, 33.9, 33.5, 34.8, 34.8, 34.8, 34.6, 34.6, 34.5,
													 																34.4, 34.4, 34.1, 33.8, 33.7, 35, 35, 34.7, 34.4, 34.2, 34.1,
													 																33.9, 33.6, 33.6, 33.8, 33.9, 33.7, 34.9, 34.7, 34.4, 34.3, 34.6,
													 																34.8, 34.7, 34.6, 34.5, 34.6, 34.8, 35, 35, 34.9, 34.8, 34.7,
													 																34.6, 34.5, 34.3, 34.1, 33.9, 34.4, 34.9, 34.5, 34, 33.7, 35,
													 																34.7, 34.5, 34.2, 33.7, 33.7, 34.6, 34.8, 34.8, 34.4, 33.7, 34.2,
													 																34.8, 34.5, 33.8, 33.6, 33.7, 34, 33.8, 34.9, 34.6, 34.3, 33.8,
													 																34.6, 34, 34.6, 34.9, 34.9, 34.7, 34.4, 33.9, 33.9, 34.4, 34.7,
													 																34.7, 34.7, 34.6, 34.4, 34, 34.9, 35, 34.9, 34.8, 34.7, 34.7,
													 																34.7, 34.7, 34.6, 34.5, 34.4, 34.4, 35, 35, 35, 35, 34.9, 34.8,
													 																34.7, 34.7, 34.8, 34.9, 34.9, 34.7, 34.5, 34.2, 33.8, 34.1, 33.7,
													 																34.4, 34.3, 34, 33.7, 33.9, 33.8, 34.9, 35, 34.9, 34.6, 34.3,
													 																33.5, 33.5, 33.8, 34.4, 34.9, 34.8, 34.5, 33.7, 33.8, 34.7, 34.1,
													 																33.6, 33.7, 34, 33.7, 32.9, 33.5, 33.4, 32.8, 33.5, 33.3, 33.2,
													 																33.2, 33.2, 33.5, 33.3, 33.1, 32.9, 33.2, 33.4, 33.3, 32.9, 32.3,
													 																32.6, 33.1, 33, 32.8, 32.6, 32.9, 33.2, 33.4, 33.5, 33.5, 33.4,
													 																33.2, 32.9, 33.2, 32.9, 33.3, 33.4, 32.7, 33.5, 33.4, 33.2, 32.9,
													 																32.5, 32.5, 33.2, 33.3, 33.2, 33.1, 32.9, 32.9, 33.1, 33.2, 33.3,
													 																33.4, 33.5, 33.4, 33.4, 33.4, 33.4, 33.5, 33.4, 33.4, 33.3, 33.2,
													 																33.1, 32.8, 32.2)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
													 																																																						-252L)) %>% 
	mutate(XX = X<15000, YY = Y>= 33.5)

km <- DF %>% 
	select(Y) %>% 
	kmeans(centers = 2)

DF$cluster <- as.factor(km$cluster)

DF %>% 
	ggplot(aes(x = X, y = Y, colour=cluster)) +
	geom_point(alpha = 0.6) +
	geom_smooth(method = "lm") +
	scale_x_continuous(labels = scales::comma) +
	labs(colour="")

2 Likes

Thank you so much @joels and @valeri !!!

This is a beautiful example for something that appears to be easy to the eye but is more complex and difficult to implement!
I'd like to follow the clustering idea a bit more.
The NCSS- Software has something called "Regression Clustering" that would be perfect for this case - however I didn't find an implementation for R, not sure if this is the correct term used there. https://ncss-wpengine.netdna-ssl.com/wp-content/themes/ncss/pdf/Procedures/NCSS/Regression_Clustering.pdf

Back to other strategies. First we need to scale the data otherwise the X-dimension will always be weighed more than the Y-dimension, but we want to have treat both equal or even want it the other way around, putting higher emphasis on the Y-dimension. Dividing the X by 20.000 may help

> DF2 =mutate(DF, X_2 = X /20000)
> max(DF2$X_2) - min(DF2$X_2)
[1] 1.4922
> max(DF2$Y) - min(DF2$Y)
[1] 2.8

Still K-Means doesn't work good enough!
Agglomerative clustering may work, here all the components are individual clusters in the beginning and then merged with neighbour clusters, this may "walk" along the pattern, separate the 2 conditions.

## agglomerative Clustering (AGNES)
library(cluster)
cluster_agnes = agnes(select(DF2, X_2,Y), diss = FALSE)

# cut the tree and add cluster information to dataframe
DF2$cluster = cutree(cluster_agnes, k = 2)

ggplot(DF2, aes(x = X, y = Y, colour = as.factor(cluster))) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm") +
  scale_x_continuous(labels = scales::comma)

Actually this works!
grafik

PS: It also works with hierarchical clustering, but only when the dissimilarity matrix is calculated using "maximum" for the method.

# calculate dissimilarity matrix
d <- dist(select(DF2, X_2,Y), method = "maximum")
# hierarchical clustering
cluster_hclust = hclust(d)
# cut the tree and assign the results
groups <- cutree(cluster_hclust, k = 2)
DF2$cluster = groups

ggplot(DF2, aes(x = X, y = Y, colour = as.factor(cluster))) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm") +
  scale_x_continuous(labels = scales::comma)

grafik

5 Likes

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.