The data I used for this assignment I pulled from dryad: Coverdale, Tyler; Agrawal, Anurag (2022). Data from: Experimental insect suppression causes loss of induced, but not constitutive, resistance in Solanum carolinense [Dataset]. Dryad.
Following the code given in the first part of the homework:
library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation
# quick and dirty, a truncated normal distribution to work on the solution set
z <- rnorm(n=3000,mean=0.2)
z <- data.frame(1:3000,z)
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)
## 'data.frame': 1703 obs. of 2 variables:
## $ ID : int 3 4 5 7 8 10 11 14 15 21 ...
## $ myVar: num 0.0808 0.2157 0.2746 0.9792 0.4333 ...
summary(z$myVar)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000774 0.390832 0.741123 0.870920 1.219252 3.936159
#plotting histogram of data
p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## â„ą Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p1)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## â„ą Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#adding empirical density curve
p1 <- p1 + geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#getting maximum likelihood parameters for normal
normPars <- fitdistr(z$myVar,"normal")
print(normPars)
## mean sd
## 0.87091977 0.62027365
## (0.01503059) (0.01062823)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 0.871 0.62
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.015 0.0106
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.000226 0 0 0.000113
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 1703
## $ loglik : num -1603
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## mean
## 0.8709198
#plotting normal probability density
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(z$myVar),len=length(z$myVar))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#plotting exponential probability density
expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$myVar), args = list(rate=rateML))
p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#plotting uniform probability density
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#plotting gamma probability density
gammaPars <- fitdistr(z$myVar,"gamma")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#plotting beta probability density
pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (`geom_bar()`).
And with data from dryad
z <- read.csv("Supplemental_Data_2.csv",header=TRUE,sep=",")
str(z)
## 'data.frame': 15 obs. of 9 variables:
## $ Plot.. : int 1 2 4 5 6 7 8 9 10 11 ...
## $ Insect.suppression.treatment: chr "Insect Suppression" "Control" "Insect Suppression" "Control" ...
## $ X..horsenettle.surveyed : int 8 5 8 7 8 7 8 8 8 2 ...
## $ Prop..flowering : num 0.25 0 0.38 0.14 0.5 0.29 0.13 0.38 0.25 0.5 ...
## $ Mean.height..cm. : num 21.2 23.6 22.5 22 26.6 ...
## $ Mean...leaves : num 5.38 4.8 5.5 5.29 5.25 6.29 4.25 5 4.38 6 ...
## $ Epitrix.damage : num 0 6.66 26.68 12.5 15.63 ...
## $ L..juncta.damage : num 12.5 26 15.53 9.69 19.28 ...
## $ Mean...prickles : num 2.43 4.98 1.84 2.2 6.13 6.34 3.89 3.59 5.51 6.65 ...
summary(z)
## Plot.. Insect.suppression.treatment X..horsenettle.surveyed
## Min. : 1.000 Length:15 Min. :2.000
## 1st Qu.: 5.500 Class :character 1st Qu.:7.000
## Median : 9.000 Mode :character Median :8.000
## Mean : 8.867 Mean :7.133
## 3rd Qu.:12.500 3rd Qu.:8.000
## Max. :16.000 Max. :8.000
## Prop..flowering Mean.height..cm. Mean...leaves Epitrix.damage
## Min. :0.000 Min. :21.25 Min. :3.880 Min. : 0.00
## 1st Qu.:0.155 1st Qu.:22.62 1st Qu.:4.565 1st Qu.: 9.58
## Median :0.290 Median :26.63 Median :5.250 Median :22.50
## Mean :0.300 Mean :26.40 Mean :5.155 Mean :24.78
## 3rd Qu.:0.440 3rd Qu.:28.75 3rd Qu.:5.440 3rd Qu.:37.90
## Max. :0.500 Max. :36.88 Max. :7.130 Max. :70.41
## L..juncta.damage Mean...prickles
## Min. : 4.16 Min. :1.350
## 1st Qu.:11.67 1st Qu.:3.010
## Median :19.04 Median :3.890
## Mean :19.87 Mean :4.175
## 3rd Qu.:26.55 3rd Qu.:5.460
## Max. :41.25 Max. :6.650
p1 <- ggplot(data = z, aes(x = Mean.height..cm.)) +
geom_histogram(aes(fill = Mean...prickles), color = "grey60", bins = 30) +
scale_fill_gradient(low = "cornsilk", high = "cornflowerblue") + # Gradient fill color
labs(x = "Mean Height (cm)", y = "Frequency")
print(p1)
## Warning: The following aesthetics were dropped during statistical transformation: fill
## â„ą This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## â„ą Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
#plotting histogram of data
p1 <- ggplot(data = z, aes(x = Mean.height..cm.)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#adding empirical density curve
p1 <- p1 + geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#getting maximum likelihood parameters for normal
normPars <- fitdistr(z$Mean.height..cm.,"normal")
print(normPars)
## mean sd
## 26.3960000 4.0508498
## ( 1.0459249) ( 0.7395806)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 26.4 4.05
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 1.05 0.74
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 1.094 0 0 0.547
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 15
## $ loglik : num -42.3
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## mean
## 26.396
#plotting normal probability density
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(z$Mean.height..cm.),len=length(z$Mean.height..cm.))
stat <- stat_function(aes(x = xval, y = Mean.height..cm.), fun = dnorm, colour="red", n = length(z$Mean.height..cm.), args = list(mean = meanML, sd = sdML))
p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#plotting exponential probability density
expoPars <- fitdistr(z$Mean.height..cm.,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = Mean.height..cm.), fun = dexp, colour="blue", n = length(z$Mean.height..cm.), args = list(rate=rateML))
p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#plotting uniform probability density
stat3 <- stat_function(aes(x = xval, y = Mean.height..cm.), fun = dunif, colour="darkgreen", n = length(z$Mean.height..cm.), args = list(min=min(z$Mean.height..cm.), max=max(z$Mean.height..cm.)))
p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#plotting gamma probability density
gammaPars <- fitdistr(z$Mean.height..cm.,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = Mean.height..cm.), fun = dgamma, colour="purple", n = length(z$Mean.height..cm.), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#plotting beta probability density
pSpecial <- ggplot(data=z, aes(x=Mean.height..cm./(max(Mean.height..cm. + 0.1)), y=Mean.height..cm.)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
betaPars <- fitdistr(x=z$Mean.height..cm./max(z$Mean.height..cm. + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = Mean.height..cm.), fun = dbeta, colour="orchid", n = length(z$Mean.height..cm.), args = list(shape1=shape1ML,shape2=shape2ML))
#pSpecial + statSpecial
The normal distribution looks like it fits the best, so I will go with that one.
Mean = 26.4 SD = 4.05
Now creating another data set with the parameters from the normal distribution.
y <- rnorm(n=15,mean=26.4,sd=4.05)
y <- data.frame(1:15,y)
names(y) <- list("ID","myVar")
y <- y[y$myVar>0,]
str(y)
## 'data.frame': 15 obs. of 2 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ myVar: num 25.8 24.5 22.5 26.3 27.7 ...
summary(y$myVar)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22.08 23.61 25.99 26.21 27.58 35.95
#plotting histogram of data
p1normal <- ggplot(data=y, aes(x=myVar, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1normal)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#adding empirical density curve
p1normal <- p1normal + geom_density(linetype="dotted",size=0.75)
print(p1normal)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#getting maximum likelihood parameters for normal
normPars <- fitdistr(y$myVar,"normal")
print(normPars)
## mean sd
## 26.2143018 3.4096665
## ( 0.8803721) ( 0.6225171)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 26.21 3.41
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.88 0.623
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.775 0 0 0.388
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 15
## $ loglik : num -39.7
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## mean
## 26.2143
#plotting normal probability density
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(y$myVar),len=length(y$myVar))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(y$myVar), args = list(mean = meanML, sd = sdML))
p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The histogram profiles appear to be similar. I do think that in the future I will use a larger data set. Part of the problem with my lack of comfort with the results is the lack of data, there are very few data points, something I didn’t think about much when I pulled the data. This makes me unsure about the entire thing.