Representing training data

user124578 · July 3, 2020, 12:29am

I am trying to find the best way to represent my training data. I would like to understand how the difference between the predictions and the true values changes from one batch to another. This is only a sample of the data; the full data contains 1000 records. What is the best graphical representation to use for this, please?

# Sample 1: batches 245-250. Each row is either the actual or the predicted
# value (see Value.Type) for images A-H of that batch.
data.frame(
  Batch.No = c(245L,246L,246L,247L,247L,248L,248L,
               249L,249L,250L,250L),
  Image.A = c(484,473.4394226,306,480.7792358,557,
              578.9783936,613,460.1863403,612,408.3404541,315),
  Image.B = c(431,413.6788025,812,509.5707397,484,
              405.3319702,304,445.0266418,507,570.5655518,683),
  Image.C = c(802,471.8282776,479,540.9875488,598,
              447.3132935,586,472.5936584,295,432.6842041,396),
  Image.D = c(467,473.2600708,479,410.3531189,451,
              410.8186035,516,419.1672668,241,474.9749451,722),
  Image.E = c(430,421.0501709,488,417.0779724,571,
              465.0914917,710,519.2244263,433,423.9597473,552),
  Image.F = c(598,419.3078613,34,437.076416,447,
              469.7338257,553,519.0509033,661,449.4110718,505),
  Image.G = c(396,444.025116,619,460.428833,802,
              434.075592,507,462.6940918,263,485.8326111,276),
  Image.H = c(571,452.5244141,411,397.3113708,429,
              506.5319214,343,433.348938,34,423.3520813,376),
  Value.Type = as.factor(c("Actual","Pred",
                           "Actual","Pred","Actual","Pred","Actual","Pred",
                           "Actual","Pred","Actual"))
)

# Sample 2: batches 495-500, same layout as sample 1. It is a separate
# statement: the original paste fused it onto the closing parenthesis of the
# first call (`)data.frame(`), which R cannot parse.
data.frame(
  Batch.No = c(495L,496L,496L,497L,497L,498L,498L,
               499L,499L,500L,500L),
  Image.A = c(263,440.2657776,505,391.7754822,304,
              378.2554321,229,407.1863403,619,365.4620361,315),
  Image.B = c(229,427.150177,34,406.7923279,571,
              378.7658997,429,432.6651611,479,396.3605042,507),
  Image.C = c(375,422.8170776,389,402.9620361,812,
              377.5062561,429,381.6202393,812,419.3835144,722),
  Image.D = c(160,429.7803345,488,403.2651367,451,
              442.1589355,467,456.796814,661,418.9455566,295),
  Image.E = c(468,469.8259583,468,405.3244629,396,
              452.3336792,557,488.3843689,598,428.8172607,276),
  Image.F = c(447,448.9035339,431,484.988739,433,
              390.5343933,396,459.8133545,513,386.892334,571),
  Image.G = c(602,557.2020874,683,404.8573608,375,
              400.1952515,467,388.64151,467,409.2952271,263),
  Image.H = c(431,468.7738342,710,502.2818604,484,
              466.7520752,513,400.0844116,225,373.730011,880),
  Value.Type = as.factor(c("Actual","Pred",
                           "Actual","Pred","Actual","Pred","Actual","Pred",
                           "Actual","Pred","Actual"))
)

nirgrahamuk · July 3, 2020, 7:11am

It seems like your predictions and actuals are different orders of magnitude, which implies that the attempt to predict the actual value was a failure...

user124578 · July 3, 2020, 8:09am

That's right! This is the start of the training, and the values do improve, but they keep varying, so I would like to understand whether there is a pattern to this and whether the model is not performing as it should. I cannot tell this just by looking at 1000+ rows... Thanks!

nirgrahamuk · July 3, 2020, 8:13am

Could you share more data?
Perhaps:

library(dplyr)

# Emit a paste-ready definition of rows 490-500, keeping only the batch column
# and the prediction/actual pair for the first three images.
data %>%
  slice(490:500) %>%
  select(all_of(c('Batch','Image1.pred', 'Image1.actual','Image2.pred','Image2.actual','Image3.pred','Image3.actual'))) %>%
  datapasta::df_paste()

# Same columns for rows 990-1000.
data %>%
  slice(990:1000) %>%
  select(all_of(c('Batch','Image1.pred', 'Image1.actual','Image2.pred','Image2.actual','Image3.pred','Image3.actual'))) %>%
  datapasta::df_paste()

user124578 · July 3, 2020, 10:09am

Thanks! I've updated the post with more data. Thanks

nirgrahamuk · July 3, 2020, 10:45am

I'd say probably calculate a difference metric between the predictions and the actuals, and start with a simple scatter plot.

library(tidyverse)

# First sample of the training log: batches 245-250.
# One row per (batch, Value.Type) pair; Image.A .. Image.H hold that row's
# value for each image. Value.Type marks the row as "Actual" or "Pred".
# Batch 245 appears with an "Actual" row only, so it has no prediction to
# pair with.
df1<-data.frame(
  Batch.No = c(245L,246L,246L,247L,247L,248L,248L,
               249L,249L,250L,250L),
  Image.A = c(484,473.4394226,306,480.7792358,557,
              578.9783936,613,460.1863403,612,408.3404541,315),
  Image.B = c(431,413.6788025,812,509.5707397,484,
              405.3319702,304,445.0266418,507,570.5655518,683),
  Image.C = c(802,471.8282776,479,540.9875488,598,
              447.3132935,586,472.5936584,295,432.6842041,396),
  Image.D = c(467,473.2600708,479,410.3531189,451,
              410.8186035,516,419.1672668,241,474.9749451,722),
  Image.E = c(430,421.0501709,488,417.0779724,571,
              465.0914917,710,519.2244263,433,423.9597473,552),
  Image.F = c(598,419.3078613,34,437.076416,447,
              469.7338257,553,519.0509033,661,449.4110718,505),
  Image.G = c(396,444.025116,619,460.428833,802,
              434.075592,507,462.6940918,263,485.8326111,276),
  Image.H = c(571,452.5244141,411,397.3113708,429,
              506.5319214,343,433.348938,34,423.3520813,376),
  Value.Type = as.factor(c("Actual","Pred",
                           "Actual","Pred","Actual","Pred","Actual","Pred",
                           "Actual","Pred","Actual"))
)


# Second sample of the training log: batches 495-500, same column layout as
# df1 (Batch.No, Image.A .. Image.H, Value.Type).
# Batch 495 appears with an "Actual" row only, so it has no prediction to
# pair with.
df2<-data.frame(
  Batch.No = c(495L,496L,496L,497L,497L,498L,498L,
               499L,499L,500L,500L),
  Image.A = c(263,440.2657776,505,391.7754822,304,
              378.2554321,229,407.1863403,619,365.4620361,315),
  Image.B = c(229,427.150177,34,406.7923279,571,
              378.7658997,429,432.6651611,479,396.3605042,507),
  Image.C = c(375,422.8170776,389,402.9620361,812,
              377.5062561,429,381.6202393,812,419.3835144,722),
  Image.D = c(160,429.7803345,488,403.2651367,451,
              442.1589355,467,456.796814,661,418.9455566,295),
  Image.E = c(468,469.8259583,468,405.3244629,396,
              452.3336792,557,488.3843689,598,428.8172607,276),
  Image.F = c(447,448.9035339,431,484.988739,433,
              390.5343933,396,459.8133545,513,386.892334,571),
  Image.G = c(602,557.2020874,683,404.8573608,375,
              400.1952515,467,388.64151,467,409.2952271,263),
  Image.H = c(431,468.7738342,710,502.2818604,484,
              466.7520752,513,400.0844116,225,373.730011,880),
  Value.Type = as.factor(c("Actual","Pred",
                           "Actual","Pred","Actual","Pred","Actual","Pred",
                           "Actual","Pred","Actual")))
  
  # Stack both samples into one table; `batchno` is the join key used below.
  df <- union_all(df1, df2) %>%
    rename(batchno = `Batch.No`)

  # Per-image value columns, discovered from the data so the script keeps
  # working if images are added or removed.
  image_cols <- grep("^Image\\.", names(df), value = TRUE)

  df_a <- filter(df, Value.Type == "Actual")
  df_p <- filter(df, Value.Type == "Pred")

  # Inner join on batch: batches lacking either an actual or a predicted row
  # are dropped. merge() suffixes clashing columns, so "<col>.x" holds the
  # actual value and "<col>.y" the prediction.
  # NOTE: if a batch ever has more than one row of the same Value.Type, merge()
  # pairs every actual with every prediction of that batch.
  dfm <- merge(df_a, df_p, by = "batchno")

  # Prediction error for one image column of a merged actual/pred table.
  #
  # varname:  base column name, e.g. "Image.A"; `data` must contain
  #           "<varname>.x" (actual) and "<varname>.y" (prediction).
  # data:     merged table to read from; defaults to `dfm`.
  # relative: FALSE (default) gives prediction - actual;
  #           TRUE gives (prediction - actual) / actual.
  #
  # Returns the error vector, one element per row of `data`. The function no
  # longer writes into `dfm` itself; the caller assigns the result.
  cleandiff <- function(varname, data = dfm, relative = FALSE) {
    actual <- data[[paste0(varname, ".x")]]
    predicted <- data[[paste0(varname, ".y")]]
    error <- predicted - actual
    if (relative) {
      error / actual
    } else {
      error
    }
  }

  # One error column per image, named after the image itself.
  dfm[image_cols] <- lapply(image_cols, cleandiff)

  dfnew <- select(dfm, batchno, all_of(image_cols))

  # Long format: one row per (batch, image) with its error in `value`.
  dflonger <- pivot_longer(dfnew,
                           cols = all_of(image_cols),
                           names_to = "Image",
                           values_to = "value")

  ggplot(data = dflonger,
         mapping = aes(x = batchno,
                       y = value,
                       color = Image)) +
    geom_point()  #+
  #geom_smooth(method = "lm",alpha=0.1)

But I am lacking a lot of context about what you are doing.

user124578 · July 3, 2020, 10:51am

Thanks! Sorry. I am trying to predict the number of cells in an image using machine learning. I think I was looking for a graph which shows how the distribution of the differences between the predictions and the actual values changes over the iterations. Does that make sense?

system · July 10, 2020, 10:51am

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.