We will explore the process of embedding a linear regression line within a scatter plot using R. For illustration, we will use the well-known Iris dataset, which is readily available in R. The Iris dataset contains measurements of sepal length, sepal width, petal length, and petal width for three different species of iris flowers: Setosa, Versicolor, and Virginica. By visualizing the relationship between two numerical variables through a scatter plot, we can better understand the correlation between them. To enhance this visualization, we will overlay a linear regression line, which helps in identifying trends and patterns in the data. We will implement this first with base R graphics and then with the ggplot2 package, which provides a powerful and flexible approach for creating high-quality visualizations.
# Load the built-in iris data set and show a per-column summary.
data("iris")
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
We will first consider the scatter diagram relating sepal length (on the x axis) and sepal width (on the y axis).
# Base-graphics scatter plot: sepal length on the x axis, sepal width on the
# y axis, with explicit axis labels and title.
with(
  iris,
  plot(
    Sepal.Length, Sepal.Width,
    xlab = "Sepal Length",
    ylab = "Sepal Width",
    main = "Sepal Length against Sepal Width"
  )
)
Now, to embed the regression line into the scatter diagram, we first fit the regression line through these points using lm() and then draw it using the abline() function available in base R.
# Fit sepal width (response) as a linear function of sepal length (predictor).
# The columns are named in the formula and the data frame is passed through
# `data =` rather than writing `iris$...` inside the formula: the fitted
# coefficients are the same, but the coefficient is named "Sepal.Length" and
# predict(model, newdata = ...) works with new observations.
model <- lm(Sepal.Width ~ Sepal.Length, data = iris)

# Redraw the scatter plot with the predictor on the x axis and the response on
# the y axis, so that it matches the orientation of the fitted model.
plot(
  iris$Sepal.Length, iris$Sepal.Width,
  xlab = "Sepal Length",
  ylab = "Sepal Width",
  main = "Sepal Length against Sepal Width",
  col = "blue"
)

# Overlay the fitted line; abline() reads the intercept and slope from `model`.
abline(model, col = "red")
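Note that when plotting with base R, the variables must agree between plot() and lm(): the independent variable (plotted along the x axis) must be the one given after the ~ symbol in the model formula, and the dependent variable (plotted along the y axis) must be the one given before the ~ symbol.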
Now we shall consider the above plot using ggplot2.
# Attach ggplot2 and draw the same scatter plot with it.
library(ggplot2)
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point() +
  labs(
    title = "Sepal Length against Sepal Width",
    x = "Sepal Length",
    y = "Sepal Width"
  )
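We can format the background of the plot by adding theme_minimal() to the ggplot call. The advantages of using theme_minimal() are a cleaner look: the grey panel background is removed and only light grid lines remain, so attention stays on the data.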
# Scatter plot with the minimal theme applied.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point() +
  labs(
    title = "Sepal Length against Sepal Width",
    x = "Sepal Length",
    y = "Sepal Width"
  ) +
  theme_minimal()
Now we will embed the linear regression line into the plot using ggplot.
# Scatter plot with a fitted linear-model line; geom_smooth() also draws its
# confidence band by default.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Sepal Length against Sepal Width",
    x = "Sepal Length",
    y = "Sepal Width"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
If we do not want to show the confidence interval (the shaded band around the line), we can set the parameter se=FALSE in geom_smooth().
# Same plot, with the confidence band switched off via se = FALSE.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Sepal Length against Sepal Width",
    x = "Sepal Length",
    y = "Sepal Width"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Now we shall consider separate colours for each species in the scatter diagram.
# Map point colour to Species so each species gets its own default colour.
ggplot(
  iris,
  aes(x = Sepal.Length, y = Sepal.Width, color = Species)
) +
  geom_point() +
  labs(
    title = "Sepal Length against Sepal Width",
    x = "Sepal Length",
    y = "Sepal Width"
  ) +
  theme_minimal()
We shall set the position of the legend using the theme() function. Note that theme() should be applied after theme_minimal(), because theme_minimal() would otherwise overwrite these settings. Centering the title of the plot is also done using theme().
# Coloured scatter plot with a horizontal legend below the panel and a centred
# title. theme() is added after theme_minimal() so its settings are kept.
ggplot(
  iris,
  aes(x = Sepal.Length, y = Sepal.Width, color = Species)
) +
  geom_point() +
  labs(
    title = "Sepal Length against Sepal Width",
    x = "Sepal Length",
    y = "Sepal Width"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    plot.title = element_text(hjust = 0.5)
  )
In the previous section we saw the case where each species is coloured with the default colours. Now let's consider the case where we assign custom colours to each species: yellow to Setosa, green to Virginica, and cyan to Versicolor.
# Named colour vector: names are the Species levels, values are the colours.
custom_colors <- c(
  setosa = "yellow",
  virginica = "green",
  versicolor = "cyan"
)
library(ggplot2)

# Scatter plot coloured by species, using the manual palette defined above.
ggplot(
  iris,
  aes(x = Sepal.Length, y = Sepal.Width, color = Species)
) +
  geom_point() +
  labs(
    title = "Sepal Length against Sepal Width",
    x = "Sepal Length",
    y = "Sepal Width"
  ) +
  scale_color_manual(values = custom_colors) +
  theme_minimal()
# Because colour is mapped to Species in the plot-level aes(), geom_smooth()
# draws one regression line per species (default colours, no confidence band).
ggplot(
  iris,
  aes(x = Sepal.Length, y = Sepal.Width, color = Species)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Sepal Length against Sepal Width",
    x = "Sepal Length",
    y = "Sepal Width"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    plot.title = element_text(hjust = 0.5)
  )
## `geom_smooth()` using formula = 'y ~ x'
Now we shall assign custom colours to each species and keep the per-species regression lines.
# Named colour vector: names are the Species levels, values are the colours.
custom_colors <- c(
  setosa = "yellow",
  virginica = "green",
  versicolor = "cyan"
)
library(ggplot2)

# Per-species points and regression lines in the custom colours, with a
# centred title and a horizontal legend below the panel.
ggplot(
  iris,
  aes(x = Sepal.Length, y = Sepal.Width, color = Species)
) +
  geom_point() +
  labs(
    title = "Sepal Length against Sepal Width",
    x = "Sepal Length",
    y = "Sepal Width"
  ) +
  scale_color_manual(values = custom_colors) +
  geom_smooth(method = "lm", se = FALSE) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "bottom",
    legend.direction = "horizontal"
  )
## `geom_smooth()` using formula = 'y ~ x'
#### Shape of the points varies with Species
# Map both colour and point shape to Species, using ggplot2's defaults.
ggplot(
  iris,
  aes(x = Sepal.Length, y = Sepal.Width, color = Species, shape = Species)
) +
  geom_point()

# Named shape vector: names are the Species levels, values are point-shape codes.
custom_shape <- c(setosa = 18, virginica = 22, versicolor = 25)

# Same plot with manual colour and shape scales. custom_colors is the named
# colour vector defined earlier in this document.
ggplot(
  iris,
  aes(x = Sepal.Length, y = Sepal.Width, color = Species, shape = Species)
) +
  geom_point() +
  scale_color_manual(values = custom_colors) +
  scale_shape_manual(values = custom_shape) +
  theme_minimal()
# Attach ggplot2 and reload the iris data set
library(ggplot2)
data(iris)

# Add an `id` column holding each observation's row name converted to a
# number; it is used below as the text label for each point.
iris[["id"]] <- as.numeric(rownames(iris))

# Scatter plot coloured by species with every point labelled by its id
ggplot(
  iris,
  aes(x = Sepal.Length, y = Sepal.Width, color = Species, label = id)
) +
  geom_point(size = 3) + # enlarged scatter points
  geom_text(vjust = -0.75, size = 2) + # small id labels, offset via vjust
  labs(
    title = "Scatter Plot of Sepal Length and Sepal width",
    x = "Sepal Length in cm",
    y = "Sepal Width in cm"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    plot.title = element_text(hjust = 0.5)
  )