# Install packages
# Each dependency is installed only when it is not already available;
# requireNamespace() checks for the package without attaching it.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
if (!requireNamespace("ggpubr", quietly = TRUE)) {
  install.packages("ggpubr")
}
if (!requireNamespace("ggthemes", quietly = TRUE)) {
  install.packages("ggthemes")
}
# Load packages
library(ggplot2)
library(dplyr)
library(ggpubr)
library(ggthemes)
Half Violin
The half violin plot is a statistical graph that displays both the distribution and the probability density of data: it keeps the right half of a violin plot and replaces the left half with a frequency count plot of the data.
Setup
System Requirements: Cross-platform (Linux/MacOS/Windows)
Programming language: R
Dependent packages: ggplot2; dplyr; ggpubr; ggthemes
Data Preparation
The loaded data is a data set of gene names and their expression levels in different tumors.
# Load data
# Read the input table; the first line of the file is used as the header.
data <- read.delim("files/Hiplot/085-half-violin-data.txt", header = TRUE)

# Convert data structure
# Standardise the column names, then turn the second column into a factor
# whose levels follow the order in which the groups first appear in the file
# (rather than alphabetical order), so that order is kept on the plot axis.
colnames(data) <- c("Value", "Group")
data[, 2] <- factor(data[, 2], levels = unique(data[, 2]))

# View data
head(data)
Value Group
1 12.10228 AML
2 12.61382 AML
3 12.52741 AML
4 12.67990 AML
5 12.64837 AML
6 12.12146 AML
Visualization
# Half Violin

#' Layer constructor for the flat (half) violin geom.
#'
#' Builds a ggplot2 layer that is drawn with `geom_flat_violin_proto`.
#'
#' @param mapping,data,stat,position,show.legend,inherit.aes Passed unchanged
#'   to `ggplot2::layer()`.
#' @param trim,scale Forwarded to the layer through `params`.
#' @param ... Further parameters, also forwarded through `params`.
#' @return The object returned by `ggplot2::layer()`.
geom_flat_violin <- function(
    mapping = NULL, data = NULL, stat = "ydensity", position = "dodge",
    trim = TRUE, scale = "area", show.legend = NA, inherit.aes = TRUE, ...) {
  ggplot2::layer(
    data = data, mapping = mapping, stat = stat,
    geom = geom_flat_violin_proto, position = position,
    show.legend = show.legend, inherit.aes = inherit.aes,
    params = list(trim = trim, scale = scale, ...)
  )
}

#' Null-default operator: returns `a` unless it is NULL, otherwise `b`.
"%||%" <- function(a, b) {
  if (!is.null(a)) {
    a
  } else {
    b
  }
}
# ggproto object that draws one half of a violin as a closed polygon.
geom_flat_violin_proto <-
  ggproto("geom_flat_violin_proto", Geom,
    # Adds the width and the per-group bounding columns used when drawing.
    setup_data = function(data, params) {
      # Width precedence: a width already in the data, then the width
      # parameter, then 90% of the resolution of x.
      data$width <- data$width %||%
        params$width %||% (resolution(data$x, FALSE) * 0.9)

      # Per group: y range, and an x range that extends half a width to the
      # right of x only.
      data %>%
        dplyr::group_by(.data = ., group) %>%
        dplyr::mutate(.data = ., ymin = min(y), ymax = max(y), xmin = x,
                      xmax = x + width / 2)
    },
    # Builds the outline of one group and draws it as a polygon.
    draw_group = function(data, panel_scales, coord) {
      # xminv is the flat edge at x; xmaxv is the curved edge, offset from x
      # in proportion to violinwidth.
      # NOTE(review): violinwidth is presumably supplied by the "ydensity"
      # stat - confirm.
      data <- base::transform(data, xminv = x,
                              xmaxv = x + violinwidth * (xmax - x))
      # Trace the flat edge by increasing y, then the curved edge by
      # decreasing y, so the rows form a continuous outline.
      newdata <- base::rbind(
        dplyr::arrange(.data = base::transform(data, x = xminv), y),
        dplyr::arrange(.data = base::transform(data, x = xmaxv), -y))

      # Repeat the first row at the end to close the polygon.
      newdata <- rbind(newdata, newdata[1, ])

      # ggname is not exported by ggplot2, hence the `:::` access.
      ggplot2:::ggname("geom_flat_violin",
                       GeomPolygon$draw_panel(newdata, panel_scales, coord))
    },
    draw_key = draw_key_polygon,
    default_aes = ggplot2::aes(weight = 1, colour = "grey20", fill = "white",
                               size = 0.5, alpha = NA, linetype = "solid"),
    required_aes = c("x", "y")
  )
# Assemble the plot: half violin, box plot with a mean point, and dot plot.
p <- ggplot(data = data, aes(Group, Value, fill = Group)) +
  # Half violin scaled by the number of observations, tails not trimmed
  geom_flat_violin(alpha = 1, scale = "count", trim = FALSE) +
  # Narrow white box plot nudged to the right; outlier points are not coloured
  geom_boxplot(width = 0.05, fill = "white", alpha = 1,
               outlier.colour = NA, position = position_nudge(0.05)) +
  # Group mean drawn as a white-filled point at the same nudged position
  stat_summary(fun = mean, geom = "point", fill = "white", shape = 21, size = 2,
               position = position_nudge(0.05)) +
  # Dots binned along y and stacked downwards, nudged slightly to the left
  geom_dotplot(alpha = 1, binaxis = "y", dotsize = 0.5, stackdir = "down",
               binwidth = 0.1, position = position_nudge(-0.025)) +
  theme(legend.position = "none") +
  # Axis titles are taken from the column names of the data
  xlab(colnames(data)[2]) +
  ylab(colnames(data)[1]) +
  # Suppress the fill legend ("none" replaces the deprecated logical form)
  guides(fill = "none") +
  ggtitle("Half Violin Plot") +
  scale_fill_manual(values = c("#e04d39","#5bbad6","#1e9f86")) +
  theme_stata() +
  theme(text = element_text(family = "Arial"),
        plot.title = element_text(size = 12,hjust = 0.5),
        axis.title = element_text(size = 12),
        axis.text = element_text(size = 10),
        axis.text.x = element_text(angle = 0, hjust = 0.5,vjust = 1),
        legend.position = "right",
        legend.direction = "vertical",
        legend.title = element_text(size = 10),
        legend.text = element_text(size = 10))

# Print the plot
p

The half violin plot shows the data distribution in a way similar to a box plot. The black horizontal line in each box marks the median gene expression level in that tumor, and the upper and lower edges of the white box represent the upper and lower quartiles of the data set. The left half shows the distribution of the individual data points. The violin shape also reflects the data density: the more concentrated the data, the fatter the shape. Gene expression is most concentrated in the BLGG group, followed by the BIC group and the AML group.