The data I used for this assignment I pulled from dryad: Coverdale, Tyler; Agrawal, Anurag (2022). Data from: Experimental insect suppression causes loss of induced, but not constitutive, resistance in Solanum carolinense [Dataset]. Dryad.

Following the code given in the first part of the homework:

library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation

# quick and dirty, a truncated normal distribution to work on the solution set

z <- rnorm(n=3000,mean=0.2)
z <- data.frame(1:3000,z)
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)
## 'data.frame':    1703 obs. of  2 variables:
##  $ ID   : int  3 4 5 7 8 10 11 14 15 21 ...
##  $ myVar: num  0.0808 0.2157 0.2746 0.9792 0.4333 ...
summary(z$myVar)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000774 0.390832 0.741123 0.870920 1.219252 3.936159
#plotting histogram of data
p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## â„ą Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p1)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## â„ą Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#adding empirical density curve
p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#getting maximum likelihood parameters for normal
normPars <- fitdistr(z$myVar,"normal")
print(normPars)
##       mean          sd    
##   0.87091977   0.62027365 
##  (0.01503059) (0.01062823)
str(normPars)
## List of 5
##  $ estimate: Named num [1:2] 0.871 0.62
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.015 0.0106
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.000226 0 0 0.000113
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 1703
##  $ loglik  : num -1603
##  - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
##      mean 
## 0.8709198
#plotting normal probability density
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(z$myVar),len=length(z$myVar))

 stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
 p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#plotting exponential probability density
expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$myVar), args = list(rate=rateML))
 p1 + stat + stat2 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#plotting uniform probability density
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
 p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#plotting gamma probability density
gammaPars <- fitdistr(z$myVar,"gamma")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
 p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#plotting beta probability density
pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (`geom_bar()`).

And with data from dryad

z <- read.csv("Supplemental_Data_2.csv",header=TRUE,sep=",")
str(z)
## 'data.frame':    15 obs. of  9 variables:
##  $ Plot..                      : int  1 2 4 5 6 7 8 9 10 11 ...
##  $ Insect.suppression.treatment: chr  "Insect Suppression" "Control" "Insect Suppression" "Control" ...
##  $ X..horsenettle.surveyed     : int  8 5 8 7 8 7 8 8 8 2 ...
##  $ Prop..flowering             : num  0.25 0 0.38 0.14 0.5 0.29 0.13 0.38 0.25 0.5 ...
##  $ Mean.height..cm.            : num  21.2 23.6 22.5 22 26.6 ...
##  $ Mean...leaves               : num  5.38 4.8 5.5 5.29 5.25 6.29 4.25 5 4.38 6 ...
##  $ Epitrix.damage              : num  0 6.66 26.68 12.5 15.63 ...
##  $ L..juncta.damage            : num  12.5 26 15.53 9.69 19.28 ...
##  $ Mean...prickles             : num  2.43 4.98 1.84 2.2 6.13 6.34 3.89 3.59 5.51 6.65 ...
summary(z)
##      Plot..       Insect.suppression.treatment X..horsenettle.surveyed
##  Min.   : 1.000   Length:15                    Min.   :2.000          
##  1st Qu.: 5.500   Class :character             1st Qu.:7.000          
##  Median : 9.000   Mode  :character             Median :8.000          
##  Mean   : 8.867                                Mean   :7.133          
##  3rd Qu.:12.500                                3rd Qu.:8.000          
##  Max.   :16.000                                Max.   :8.000          
##  Prop..flowering Mean.height..cm. Mean...leaves   Epitrix.damage 
##  Min.   :0.000   Min.   :21.25    Min.   :3.880   Min.   : 0.00  
##  1st Qu.:0.155   1st Qu.:22.62    1st Qu.:4.565   1st Qu.: 9.58  
##  Median :0.290   Median :26.63    Median :5.250   Median :22.50  
##  Mean   :0.300   Mean   :26.40    Mean   :5.155   Mean   :24.78  
##  3rd Qu.:0.440   3rd Qu.:28.75    3rd Qu.:5.440   3rd Qu.:37.90  
##  Max.   :0.500   Max.   :36.88    Max.   :7.130   Max.   :70.41  
##  L..juncta.damage Mean...prickles
##  Min.   : 4.16    Min.   :1.350  
##  1st Qu.:11.67    1st Qu.:3.010  
##  Median :19.04    Median :3.890  
##  Mean   :19.87    Mean   :4.175  
##  3rd Qu.:26.55    3rd Qu.:5.460  
##  Max.   :41.25    Max.   :6.650
p1 <- ggplot(data = z, aes(x = Mean.height..cm.)) +
geom_histogram(aes(fill = Mean...prickles), color = "grey60", bins = 30) +
scale_fill_gradient(low = "cornsilk", high = "cornflowerblue") + # Gradient fill color
labs(x = "Mean Height (cm)", y = "Frequency") 
print(p1)
## Warning: The following aesthetics were dropped during statistical transformation: fill
## â„ą This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## â„ą Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

#plotting histogram of data
p1 <- ggplot(data = z, aes(x = Mean.height..cm.)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#adding empirical density curve
p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#getting maximum likelihood parameters for normal
normPars <- fitdistr(z$Mean.height..cm.,"normal")
print(normPars)
##       mean          sd    
##   26.3960000    4.0508498 
##  ( 1.0459249) ( 0.7395806)
str(normPars)
## List of 5
##  $ estimate: Named num [1:2] 26.4 4.05
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 1.05 0.74
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 1.094 0 0 0.547
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 15
##  $ loglik  : num -42.3
##  - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
##   mean 
## 26.396
#plotting normal probability density
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(z$Mean.height..cm.),len=length(z$Mean.height..cm.))

stat <- stat_function(aes(x = xval, y = Mean.height..cm.), fun = dnorm, colour="red", n = length(z$Mean.height..cm.), args = list(mean = meanML, sd = sdML))
p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#plotting exponential probability density
expoPars <- fitdistr(z$Mean.height..cm.,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = Mean.height..cm.), fun = dexp, colour="blue", n = length(z$Mean.height..cm.), args = list(rate=rateML))
p1 + stat + stat2 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#plotting uniform probability density
stat3 <- stat_function(aes(x = xval, y = Mean.height..cm.), fun = dunif, colour="darkgreen", n = length(z$Mean.height..cm.), args = list(min=min(z$Mean.height..cm.), max=max(z$Mean.height..cm.)))
p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#plotting gamma probability density
gammaPars <- fitdistr(z$Mean.height..cm.,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = Mean.height..cm.), fun = dgamma, colour="purple", n = length(z$Mean.height..cm.), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#plotting beta probability density
pSpecial <- ggplot(data=z, aes(x=Mean.height..cm./(max(Mean.height..cm. + 0.1)), y=Mean.height..cm.)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=z$Mean.height..cm./max(z$Mean.height..cm. + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = Mean.height..cm.), fun = dbeta, colour="orchid", n = length(z$Mean.height..cm.), args = list(shape1=shape1ML,shape2=shape2ML))
#pSpecial + statSpecial

The normal distribution looks like it fits the best, so I will go with that one.

Mean = 26.4 SD = 4.05

Now creating another data set with the parameters from the normal distribution.

y <- rnorm(n=15,mean=26.4,sd=4.05)
y <- data.frame(1:15,y)
names(y) <- list("ID","myVar")
y <- y[y$myVar>0,]
str(y)
## 'data.frame':    15 obs. of  2 variables:
##  $ ID   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ myVar: num  25.8 24.5 22.5 26.3 27.7 ...
summary(y$myVar)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   22.08   23.61   25.99   26.21   27.58   35.95
#plotting histogram of data
p1normal <- ggplot(data=y, aes(x=myVar, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1normal)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#adding empirical density curve
p1normal <-  p1normal +  geom_density(linetype="dotted",size=0.75)
print(p1normal)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#getting maximum likelihood parameters for normal
normPars <- fitdistr(y$myVar,"normal")
print(normPars)
##       mean          sd    
##   26.2143018    3.4096665 
##  ( 0.8803721) ( 0.6225171)
str(normPars)
## List of 5
##  $ estimate: Named num [1:2] 26.21 3.41
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.88 0.623
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.775 0 0 0.388
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 15
##  $ loglik  : num -39.7
##  - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
##    mean 
## 26.2143
#plotting normal probability density
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(y$myVar),len=length(y$myVar))

 stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(y$myVar), args = list(mean = meanML, sd = sdML))
 p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The histogram profiles appear to be similar. I do think that in the future I will use a larger data set. Part of the problem with my lack of comfort with the results is the lack of data, there are very few data points, something I didn’t think about much when I pulled the data. This makes me unsure about the entire thing.