# Install ggplot2 and tidyr unless they are already available
requirements <- c("ggplot2", "tidyr")
install.packages(setdiff(requirements, rownames(installed.packages())))

# tidyr is only needed much later, but it is good practice to load every
# package at the top of the script
library(ggplot2)
library(tidyr)

# Get a first impression of the dataset and open its help page
head(mtcars)
?mtcars

# Let's draw a simple histogram
ggplot(data = mtcars, mapping = aes(x = mpg)) +
  geom_histogram(binwidth = 5)

# ggplot2 is all about 'adding layers of visualization to the data'.
# To add more information we store this base plot and keep extending it.
# The end of this section shows how you would usually write it in one go;
# the step-by-step extension is for demonstration purposes only.
p <- ggplot(data = mtcars, mapping = aes(x = mpg)) +
  geom_histogram(binwidth = 5)

# Note that the assignment alone does not display the plot.
# To show it, evaluate the variable:
p

# Keeping a plot in a variable is not very common in everyday visualization
# practice, but it has the great advantage that the plot is easy to save:
ggsave(
  filename = "test.pdf",
  plot = p,
  width = 20,
  height = 15,
  units = "cm",
  dpi = 300
)
# This creates test.pdf, containing the histogram, in your working directory.
# Producing another file type is just as easy
ggsave(
  filename = "test.png",
  plot = p,
  width = 20,
  height = 15,
  units = "cm",
  dpi = 300
)

# OK, let's make the plot more beautiful, starting with axis labels
p <- p +
  xlab("Miles per Gallon") +
  ylab("Number of Cars")
p

# Title, subtitle and caption
p <- p +
  labs(
    title = "Distribution of fuel consumption",
    subtitle = "According to the mtcars dataset",
    caption = "Created by Mirco, 2022"
  )
p

# Try out a few of the built-in complete themes
p <- p + theme_dark()
p

p <- p + theme_grey()
p

p <- p + theme_minimal()
p

# In case you need to manually adjust details such as the font size:
p <- p + theme(axis.text = element_text(size = 16))
p

p <- p +
  theme(
    axis.text = element_text(size = 16),
    axis.text.x = element_text(face = "bold", angle = 315, vjust = .3),
    axis.title = element_text(size = 20),
    plot.caption = element_text(colour = "darkgrey")
  )
p

# See the help page of theme() for all the parameters you can adjust
?theme

#
# Usually, a ggplot2 command looks like this:
ggplot(data = mtcars, mapping = aes(x = mpg)) +
  geom_histogram(binwidth = 5) +
  xlab("Miles per Gallon") +
  ylab("Number of Cars") +
  labs(
    title = "Distribution of fuel consumption",
    subtitle = "According to the mtcars dataset",
    caption = "Created by Mirco, 2022"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 20),
    plot.caption = element_text(colour = "darkgrey")
  )

# Pay attention to the order of commands!
ggplot(data = mtcars, mapping = aes(x = mpg)) +
  geom_histogram(binwidth = 5) +
  xlab("Miles per Gallon") +
  ylab("Number of Cars") +
  labs(
    title = "Distribution of fuel consumption",
    subtitle = "According to the mtcars dataset",
    caption = "Created by Mirco, 2022"
  ) +
  theme(
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 20),
    plot.caption = element_text(colour = "darkgrey")
  ) +
  theme_minimal()
# Placing theme_minimal() after the theme() call overrides all our adjustments!
#
#
#
# Other types of plots are just as easy! It's only a matter of adding the
# right layer.
# Create Scatterplots
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  xlab("Weight (x 1000lbs)") +
  ylab("Miles per Gallon")

# Add trend lines to your scatterplots
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth() +
  xlab("Weight (x 1000lbs)") +
  ylab("Miles per Gallon")

# Add some color to the plot!
mtcars_fac <- mtcars
# Therefore, we need to convert the cylinder column to a factor.
# Otherwise ggplot interprets the numbers as being continuous.
mtcars$cyl
sum(mtcars$cyl)
class(mtcars$cyl)

mtcars_fac$cyl <- as.factor(mtcars$cyl)
mtcars_fac$cyl
# sum() is not meaningful for factors and raises an error. try() still shows
# that error but lets the script continue when it is sourced as a whole.
try(sum(mtcars_fac$cyl))
class(mtcars_fac$cyl)

# Have a look at the difference by replacing mtcars_fac with mtcars
ggplot(mtcars_fac, aes(x = wt, y = mpg, col = cyl)) +
  geom_point() +
  xlab("Weight (x 1000lbs)") +
  ylab("Miles per Gallon")

# Create Boxplots
ggplot(mtcars_fac, aes(x = cyl, y = mpg, group = cyl)) +
  geom_boxplot() +
  xlab("Number of Cylinders") +
  ylab("Miles per Gallon")

# Create Barplots for each row of the dataset
mtcars_fac
# seq_len() is the safe way to number rows (1:nrow() misbehaves for 0 rows)
mtcars_fac$idu <- seq_len(nrow(mtcars_fac))
ggplot(mtcars_fac, aes(x = idu, y = hp)) +
  geom_bar(stat = "identity") +
  xlab(NULL) +
  ylab("Horsepower")

#
# For adjusting the axes:
# (the y aesthetic is hp, so the axis is labelled as horsepower)
p <- ggplot(mtcars_fac, aes(x = cyl, y = hp)) +
  geom_point() +
  xlab("Number of Cylinders") +
  ylab("Horsepower")
p
p + scale_x_discrete(breaks = c(4, 6, 8), labels = c("Four", "Six", "Eight"))

#
# And, very important, adjusting the limits of an axis!
p <- ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth() +
  xlab("Weight (x 1000lbs)") +
  ylab("Miles per Gallon")

# Imagine we want to set the limits on the x axis.
# There are two possibilities: using xlim (or ylim), or coord_cartesian.
# First one:
p + xlim(c(2.5, 4.5))
# which is equivalent to
p + scale_x_continuous(limits = c(2.5, 4.5))
# The other method:
p + coord_cartesian(xlim = c(2.5, 4.5))
# Notice the difference?
# coord_cartesian just 'zooms' in on the plot, leaving the data unchanged.
# scale_x_continuous converts everything outside the limits to NA,
# CHANGING THE DATA!
#
# Sometimes, you want to use custom colors. You can do this by specifying
# values for factor levels:
ggplot(mtcars_fac, aes(x = wt, y = mpg, col = cyl)) +
  geom_point() +
  xlab("Weight (x 1000lbs)") +
  ylab("Miles per Gallon") +
  scale_color_manual(
    values = c("4" = "#AA5639", "6" = "#403075", "8" = "#AAA739")
  )

#
# In case you want to reorder the plot, you have to reorder the factor levels...
# (the y aesthetic is hp, so the axis is labelled as horsepower)
ggplot(mtcars_fac, aes(x = cyl, y = hp)) +
  geom_point() +
  xlab("Number of Cylinders") +
  ylab("Horsepower")

levels(mtcars_fac$cyl)
# mtcars_fac$cyl <- factor(mtcars$cyl, levels=c("6","4","8")) # custom order
mtcars_fac$cyl <- factor(mtcars_fac$cyl, levels = rev(levels(mtcars_fac$cyl)))
levels(mtcars_fac$cyl)

ggplot(mtcars_fac, aes(x = cyl, y = hp)) +
  geom_point() +
  xlab("Number of Cylinders") +
  ylab("Horsepower")

# ...or the whole dataset:
mtcars_fac <- mtcars_fac[order(mtcars_fac$hp, decreasing = FALSE), ]
mtcars_fac$idu <- seq_len(nrow(mtcars_fac))
# The rows were just sorted by hp, so using the row names in their current
# order as factor levels makes ggplot draw the cars by increasing horsepower.
mtcars_fac$car <- factor(rownames(mtcars_fac), levels = rownames(mtcars_fac))

ggplot(mtcars_fac, aes(x = car, y = hp)) +
  geom_bar(stat = "identity") +
  xlab(NULL) +
  ylab("Horsepower")

#
#
# Pay attention to the "form" of mtcars:
head(mtcars)
# This is called the wide form because each variable is in its own column.
# We can use it by identifying grouping variables (e.g. columns) manually.
# However, the ggplot (and entire tidyverse) universe is built on the concept of
# long format tables. The long format of a table is usually unique. For the wide
# format there are many possible ways of representing the same data. For example,
# does a column represent cylinders, horsepower, or a combination thereof?
#
# There is no satisfying way to convert between the two forms using only base R.
# Luckily, there is tidyr.
# To be honest, that is a whole new world of working with your data that is
# worth looking into.
# Here, we only borrow tidyr's ability to pivot data. Having mtcars in long
# format would look like this:
mtcars$car <- rownames(mtcars)
mtc_long <- pivot_longer(
  data = mtcars,
  cols = -car,
  names_to = "Measure",
  values_to = "Value"
)
head(mtc_long, n = 30)

# Since mtcars isn't the best dataset to work with in long format, let's turn
# to another example in wide format:
head(iris)
summary(iris)

# Imagine we want to compare the dimensions of both Sepal and Petal while
# visually distinguishing the three species.
ggplot(data = iris) +
  geom_point(
    mapping = aes(x = Sepal.Width, y = Sepal.Length, col = Species),
    shape = 1
  ) +
  geom_point(
    mapping = aes(x = Petal.Width, y = Petal.Length, col = Species),
    shape = 4
  )
# This is all manual work...
# If we had the data like this, things could be automated much more easily:
#
#      Species  Part Measure Values
## 1    setosa Sepal  Length    5.1
## 2    setosa Sepal   Width    3.5
## 3    setosa Petal  Length    1.4
## 4    setosa Petal   Width    0.2
## .....
#
# Once it is in this specific form, ggplot2 can group the entries based on
# values in the Part & Measure columns, extracting x and y values correctly,
# for example.
iris_plot <- pivot_longer(
  data = iris,
  cols = -Species,
  names_to = "Measure",
  values_to = "Value"
)
head(iris_plot)

# Great, this gives us the long form!
ggplot(data = iris_plot, mapping = aes(x = Species, y = Value)) +
  geom_jitter() +
  facet_grid(cols = vars(Measure)) +
  theme_minimal()
# But at this point we cannot distinguish between Sepal and Petal in our plot...
# The Measure column actually contains two different values...
head(iris_plot)
# That's why we additionally use the separate() function.
# It splits the values in a column at any character that is neither a letter
# nor a number (achieved via pattern matching: sep = "[^[:alnum:]]+").
?separate

iris_plot <- pivot_longer(
  data = iris,
  cols = -Species,
  names_to = "Measures",
  values_to = "Value"
) %>%
  separate(col = Measures, into = c("Part", "Measure"))

# Now there are two new columns
head(iris_plot)

# ...and we can use their values to highlight the different parts
ggplot(data = iris_plot, mapping = aes(x = Species, y = Value, color = Part)) +
  geom_jitter() +
  facet_grid(cols = vars(Measure)) +
  theme_minimal()

#
# Often, when converting from the wide to the long form, the column names
# are actually values:
table4a
# Again, pivot_longer helps here
t4a_plot <- pivot_longer(
  data = table4a,
  cols = c(`1999`, `2000`),
  names_to = "year",
  values_to = "cases"
)
t4a_plot

ggplot(data = t4a_plot, mapping = aes(x = year, y = cases, fill = country)) +
  geom_bar(position = "dodge", stat = "identity")

ggplot(data = t4a_plot, mapping = aes(x = year, y = cases, fill = country)) +
  geom_bar(position = "fill", stat = "identity")

#
# Sometimes, you need it the other way around.
table2
# As you can see here, cases and population refer to the same year and
# describe one country.
t2_plot <- pivot_wider(data = table2, names_from = type, values_from = count)
t2_plot

# Great resources:
# http://r-statistics.co/Complete-Ggplot2-Tutorial-Part1-With-R-Code.html
# http://www.sthda.com/english/wiki/explorer.php > R software > Data Visualization > ggplot2 Essentials (no deep-links available)
# https://bookdown.org/sunboklee/introduction_to_r/intro.html