Lab5 Data Visualization
Learning Objectives
This tutorial introduces some basic examples of using the R package ggplot2 for data visualization.
You should read Healy’s data visualization book for more details https://socviz.co/
You should also check the ggplot2 cookbook for more details.
Replicating anti-AAPI hate speech data viz
The following code replicates the results of a paper I have been working on for a while. The paper, data, and code are available at https://osf.io/xtw4c/. If you find any bugs, please let me know.
The goal is to demonstrate some basic academic visualizations and the workflow behind them.
# Install pacman on first use; it is then used to install (if needed) and
# attach every other package in one call.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
library(pacman)

# Packages attached for this lab.
packages <- c(
  "tidyverse", "lubridate", "glue",
  "extrafont", "ggthemes", "ggrepel",
  "patchwork", "latex2exp",
  "scales", "cowplot"
)

# character.only = TRUE because `packages` holds the names as strings rather
# than bare symbols.
p_load(packages, character.only = TRUE)
## ggplot themes and scales
#' Publication-style ggplot2 theme
#'
#' Starts from `ggthemes::theme_foundation()` and layers on a theme with black
#' axis lines, inward-pointing ticks, no grid lines, a right-hand legend with an
#' italic title, and light-grey facet strips with bold labels.
#'
#' @param base_size Base font size forwarded to `theme_foundation()`.
#' @param base_family Kept in the signature for backward compatibility. It is
#'   not forwarded to `theme_foundation()`, so that function's own default
#'   font family applies.
#' @return A ggplot2 theme object; add it to a plot with `+`.
theme_Publication <- function(base_size = 11, base_family = "Helvetica Neue") {
  ggthemes::theme_foundation(base_size = base_size) +
    theme(
      plot.title = element_text(size = rel(1.2), hjust = 0.5),
      text = element_text(),
      panel.background = element_rect(colour = NA),
      plot.background = element_rect(colour = NA),
      panel.border = element_rect(colour = NA),
      axis.title = element_text(size = rel(1)),
      axis.title.y = element_text(angle = 90, vjust = 2),
      axis.title.x = element_text(vjust = -0.2),
      axis.text = element_text(),
      axis.line = element_line(colour = "black"),
      axis.ticks = element_line(),
      # A negative length draws the ticks inside the panel.
      axis.ticks.length = unit(-1.4, "mm"),
      # Extra margin on the axis-facing side of the tick labels, paired with
      # the inward ticks above.
      axis.text.x = element_text(
        margin = margin(t = 2.5, r = 0, b = 0, l = 0, unit = "mm")
      ),
      axis.text.y = element_text(
        margin = margin(t = 0, r = 2.5, b = 0, l = 0, unit = "mm")
      ),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      legend.key = element_rect(colour = NA),
      legend.position = "right",
      # legend.direction = "horizontal",
      # legend.key.size = unit(0.2, "cm"),
      legend.margin = margin(t = 0, unit = "cm"),
      legend.title = element_text(face = "italic"),
      plot.margin = margin(t = 10, r = 5, b = 5, l = 5, unit = "mm"),
      strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
      strip.text = element_text(face = "bold")
    )
}
# The 14 colours used, in this order, by both Publication scales below.
publication_palette <- c(
  "#386cb0", "#fdb462",
  "#7fc97f", "#ef3b2c",
  "#662506", "#a6cee3",
  "#fb9a99", "#984ea3",
  "#fa9fb5", "#8856a7",
  "#9ebcda", "#e0ecf4",
  "#f03b20", "#feb24c"
)

#' Discrete fill scale using the Publication palette
#'
#' @param ... Further arguments forwarded to `discrete_scale()`.
#' @return A discrete ggplot2 scale for the `fill` aesthetic.
scale_fill_Publication <- function(...) {
  # NOTE(review): the second positional argument is `scale_name`; newer
  # ggplot2 releases may flag it as deprecated -- confirm against the
  # installed version before removing it.
  discrete_scale(
    "fill", "Publication",
    scales::manual_pal(values = publication_palette),
    ...
  )
}

#' Discrete colour scale using the Publication palette
#'
#' @param ... Further arguments forwarded to `discrete_scale()`.
#' @return A discrete ggplot2 scale for the `colour` aesthetic.
scale_colour_Publication <- function(...) {
  # NOTE(review): see the `scale_name` remark in scale_fill_Publication().
  discrete_scale(
    "colour", "Publication",
    scales::manual_pal(values = publication_palette),
    ...
  )
}
# REPLICATE FIGURE 1
# Download the Figure 1 data.
# Recorded run: 684 rows x 5 columns (classifier, year, week, tweets, date).
dat_fig1 <- read_csv(url("https://yongjunzhang.com/files/css/dat_fig1.csv"))

# Keep the four classifiers shown in the figure, shorten their labels, and
# draw one point-and-line series per classifier on a log10 y axis.
fig1 <- dat_fig1 %>%
  filter(classifier %in% c(
    "All, COVID19 hate terms",
    "Ziems-Hate",
    "Davidson-Hate speech",
    "Vidgen-entity_directed_hostility"
  )) %>%
  mutate(classifier = case_when(
    classifier == "All, COVID19 hate terms" ~ "All-COVID19 hate terms",
    classifier == "Ziems-Hate" ~ "Ziems-Hate",
    classifier == "Davidson-Hate speech" ~ "Davidson-Hate",
    classifier == "Vidgen-entity_directed_hostility" ~ "Vidgen-Hostility"
  )) %>%
  ggplot(aes(
    x = date, y = tweets,
    group = classifier, color = classifier, shape = classifier
  )) +
  geom_point(size = 1) +
  geom_line(size = .5) +
  theme_Publication() +
  scale_x_date(date_labels = "%b/%y", date_breaks = "2 month") +
  # scale_y_continuous(breaks = seq(0, 240, 40), limits = c(0, 240)) +
  labs(y = "Tweets", x = "Month/Year") +
  scale_colour_Publication() +
  # Event markers. x and y are constants set outside aes(), so they are not
  # passed through the y scale: y is given in log10 units (7 is near 10^7).
  # check_overlap = TRUE collapses the per-row copies of each label into one.
  geom_vline(
    xintercept = as.Date("2020-01-18"), col = "blue", lwd = 0.5, lty = 2
  ) +
  geom_text(
    x = as.Date("2020-01-18"), y = 7,
    label = "Jan 18: New cases confirmed in Wuhan",
    hjust = 0, vjust = 0, colour = "blue", check_overlap = TRUE
  ) +
  geom_vline(
    xintercept = as.Date("2020-03-16"), col = "#ef3b2c", lwd = .5, lty = 2
  ) +
  geom_text(
    x = as.Date("2020-03-16"), y = 7.5,
    label = "Mar 16: Trump Tweeted ChinaVirus",
    hjust = 0, vjust = 0, colour = "#ef3b2c", check_overlap = TRUE
  ) +
  theme(legend.position = c(0.8, 0.9)) +
  scale_y_log10(
    breaks = trans_breaks("log10", function(x) 10^x),
    labels = trans_format("log10", math_format(10^.x)),
    limits = c(10^0, 10^8)
  ) +
  annotation_logticks()

# Print explicitly so the figure also appears when the script is source()d.
print(fig1)

# Pass the plot explicitly rather than relying on last_plot().
ggsave(
  "./fig1.eps",
  plot = fig1, dpi = 300, width = 187, units = "mm", height = 100
)
# REPLICATE FIGURE 2
# Download the Figure 2 data.
# Recorded run: 1872 rows x 6 columns (keywords, All, year, month, tweets,
# date).
data_fig2 <- read_csv(url("https://yongjunzhang.com/files/css/dat_fig2.csv"))

# One point-and-line series per keyword on a log10 y axis.
# The original aes() also carried `linetypes = keywords`; that is not a ggplot2
# aesthetic name (the aesthetic is `linetype`), so it was silently ignored. It
# is dropped here so the rendered figure is unchanged.
fig2 <- data_fig2 %>%
  ggplot(aes(
    # tweets + 1 keeps zero counts plottable on the log10 axis.
    x = date, y = tweets + 1,
    group = keywords, color = keywords, shape = keywords
  )) +
  geom_point(size = .5) +
  geom_line(size = .5) +
  theme_Publication() +
  scale_x_date(date_labels = "%b/%y", date_breaks = "12 month") +
  # scale_y_continuous(breaks = seq(0, 40, 10), limits = c(0, 40)) +
  labs(y = "Tweets", x = "Month/Year") +
  scale_colour_Publication() +
  # Event marker. y is a constant set outside aes(), so it is in log10 units.
  geom_vline(
    xintercept = as.Date("2020-01-18"), col = "blue", lwd = 0.5, lty = 2
  ) +
  geom_text(
    x = as.Date("2020-01-18"), y = 5.9,
    label = "Jan 18: New cases confirmed in Wuhan",
    hjust = 1, vjust = 1, colour = "blue", check_overlap = TRUE
  ) +
  scale_y_log10(
    breaks = trans_breaks("log10", function(x) 10^x),
    labels = trans_format("log10", math_format(10^.x)),
    limits = c(10^0, 10^6)
  ) +
  annotation_logticks() +
  # Supply shapes explicitly instead of using the default shape palette.
  scale_shape_manual(values = 0:24) +
  theme(legend.position = "bottom")

# Print explicitly so the figure also appears when the script is source()d.
print(fig2)

# Pass the plot explicitly rather than relying on last_plot().
ggsave(
  "./fig2.eps",
  plot = fig2, dpi = 300, width = 187, units = "mm", height = 160
)
# REPLICATE FIGURE 3
# Download the Figure 3 data.
# Recorded run: 1060 rows x 5 columns (classifier, year, week, tweets, date).
dat_fig3 <- read_csv(url("https://yongjunzhang.com/files/css/dat_fig3.csv"))

# Keep the four classifiers shown in the figure, shorten their labels, and
# draw one point-and-line series per classifier on a log10 y axis.
fig3 <- dat_fig3 %>%
  filter(classifier %in% c(
    "All, anti-Chinese politics terms",
    "Ziems-Hate",
    "Davidson-Hate speech",
    "Vidgen-entity_directed_hostility"
  )) %>%
  mutate(classifier = case_when(
    classifier == "All, anti-Chinese politics terms" ~ "All-anti Chinese terms",
    classifier == "Ziems-Hate" ~ "Ziems-Hate",
    classifier == "Davidson-Hate speech" ~ "Davidson-Hate",
    classifier == "Vidgen-entity_directed_hostility" ~ "Vidgen-Hostility"
  )) %>%
  ggplot(aes(
    x = date, y = tweets,
    group = classifier, color = classifier, shape = classifier
  )) +
  geom_point(size = 1) +
  geom_line(size = .5) +
  theme_Publication() +
  scale_x_date(date_labels = "%b/%y", date_breaks = "2 month") +
  # scale_y_continuous(breaks = seq(0, 240, 40), limits = c(0, 240)) +
  labs(y = "Tweets", x = "Month/Year") +
  scale_colour_Publication() +
  # Event markers. x and y are constants set outside aes(), so they are not
  # passed through the y scale: y is given in log10 units (7 is near 10^7).
  # check_overlap = TRUE collapses the per-row copies of each label into one.
  geom_vline(
    xintercept = as.Date("2020-01-18"), col = "blue", lwd = 0.5, lty = 2
  ) +
  geom_text(
    x = as.Date("2020-01-18"), y = 7,
    label = "Jan 18: New cases confirmed in Wuhan",
    hjust = 0, vjust = 0, colour = "blue", check_overlap = TRUE
  ) +
  geom_vline(
    xintercept = as.Date("2020-03-16"), col = "#ef3b2c", lwd = .5, lty = 2
  ) +
  geom_text(
    x = as.Date("2020-03-16"), y = 7.5,
    label = "Mar 16: Trump Tweeted ChinaVirus",
    hjust = 0, vjust = 0, colour = "#ef3b2c", check_overlap = TRUE
  ) +
  theme(legend.position = c(0.8, 0.2)) +
  scale_y_log10(
    breaks = trans_breaks("log10", function(x) 10^x),
    labels = trans_format("log10", math_format(10^.x)),
    limits = c(10^0, 10^8)
  ) +
  annotation_logticks()

# Print explicitly so the figure also appears when the script is source()d.
print(fig3)

# Pass the plot explicitly rather than relying on last_plot().
ggsave(
  "./fig3.eps",
  plot = fig3, dpi = 300, width = 187, units = "mm", height = 130
)
# REPLICATE FIGURE 4
# Download the Figure 4 data.
# Recorded run: 211 rows x 4 columns (year, week, tweets, date).
dat_fig4 <- read_csv(url("https://yongjunzhang.com/files/css/dat_fig4.csv"))

# Single point-and-line series on a log10 y axis with three event markers.
# Colour is a fixed constant here, so no colour scale or legend position is
# needed: nothing is mapped to colour and no legend is drawn.
fig4 <- dat_fig4 %>%
  ggplot(aes(x = date, y = tweets)) +
  geom_point(size = 1, color = "#feb24c") +
  geom_line(size = .5, color = "#feb24c") +
  theme_Publication() +
  scale_x_date(date_labels = "%b/%y", date_breaks = "2 month") +
  # scale_y_continuous(breaks = seq(0, 240, 40), limits = c(0, 240)) +
  labs(y = "Tweets", x = "Month/Year") +
  # Event markers. y is a constant set outside aes(), so it is in log10 units.
  # check_overlap = TRUE collapses the per-row copies of each label into one.
  geom_vline(
    xintercept = as.Date("2020-01-18"), col = "blue", lwd = 0.5, lty = 2
  ) +
  geom_text(
    x = as.Date("2020-01-18"), y = 5,
    label = "Jan 18: New cases confirmed in Wuhan",
    hjust = 0, vjust = 0, colour = "blue", check_overlap = TRUE
  ) +
  geom_vline(
    xintercept = as.Date("2020-03-16"), col = "#ef3b2c", lwd = .5, lty = 2
  ) +
  geom_text(
    x = as.Date("2020-03-16"), y = 7.5,
    label = "Mar 16: Trump Tweeted ChinaVirus",
    hjust = 0, vjust = 0, colour = "#ef3b2c", check_overlap = TRUE
  ) +
  geom_vline(
    xintercept = as.Date("2021-03-16"), col = "black", lwd = .5, lty = 2
  ) +
  geom_text(
    x = as.Date("2021-03-16"), y = 7,
    label = "Mar 16: Atlanta Spa Mass Shootings",
    hjust = 1, vjust = 1, colour = "black", check_overlap = TRUE
  ) +
  scale_y_log10(
    breaks = trans_breaks("log10", function(x) 10^x),
    labels = trans_format("log10", math_format(10^.x)),
    limits = c(10^0, 10^8)
  ) +
  annotation_logticks()

# Print explicitly so the figure also appears when the script is source()d.
print(fig4)

# Pass the plot explicitly rather than relying on last_plot().
ggsave(
  "./fig4.eps",
  plot = fig4, dpi = 300, width = 187, units = "mm", height = 100
)
# REPLICATE FIGURE 5
# Download the Figure 5 data.
# Recorded run: 40 rows x 3 columns (data, hashtag, tweets).
dat_fig5 <- read_csv(url("https://yongjunzhang.com/files/css/dat_fig5.csv"))

#' Horizontal bar chart of tweet counts per hashtag for one category
#'
#' @param df Data frame with columns `data`, `hashtag` and `tweets`.
#' @param category Value of the `data` column to keep.
#' @return A ggplot object with hashtags on the vertical axis and tweet counts
#'   on a log10 horizontal axis.
plot_hashtag_bars <- function(df, category) {
  df %>%
    # `.data$data` refers to the column; `category` is the function argument.
    filter(.data$data == category) %>%
    ggplot() +
    geom_bar(
      aes(x = hashtag, y = tweets),
      width = 0.2,
      position = "dodge",
      stat = "identity"
    ) +
    coord_flip() +
    labs(y = "", x = "") +
    scale_y_log10(
      breaks = trans_breaks("log10", function(x) 10^x),
      labels = trans_format("log10", math_format(10^.x)),
      limits = c(10^0, 10^8)
    ) +
    # Log ticks on the bottom side only.
    annotation_logticks(sides = "b") +
    theme_Publication()
}

# One panel per category. The panels are named fig5_* rather than a/b/c/d so
# that `c` does not shadow base::c.
fig5_a <- plot_hashtag_bars(dat_fig5, "Anti-Chinese Politics")
fig5_b <- plot_hashtag_bars(dat_fig5, "Counter Hate")
fig5_c <- plot_hashtag_bars(dat_fig5, "COVID-Specific Hate")
fig5_d <- plot_hashtag_bars(dat_fig5, "General Anti-AAPI Hate")

# patchwork layout: `|` places panels side by side, `/` stacks the rows;
# tag_levels = "A" labels the panels A, B, C, D.
library(patchwork)
fig5 <- (fig5_a | fig5_b) / (fig5_c | fig5_d) +
  plot_annotation(tag_levels = "A")

# Print explicitly so the figure also appears when the script is source()d.
print(fig5)

# Pass the plot explicitly rather than relying on last_plot().
ggsave(
  "./fig5.eps",
  plot = fig5, dpi = 300, width = 187, units = "mm", height = 200
)
# REPLICATE FIGURE 6
# Download the Figure 6 data.
# Recorded run, panel A: 164 rows x 5 columns
#   (year, month, `COVID tweets`, `AAPI tweets`, date).
# Recorded run, panel B: 71 rows x 5 columns
#   (year, week, `COVID tweets`, `Anti-Chinese Politics tweets`, date).
dat_fig6a <- read_csv(url("https://yongjunzhang.com/files/css/dat_fig6a.csv"))
dat_fig6b <- read_csv(url("https://yongjunzhang.com/files/css/dat_fig6b.csv"))

#' Line chart comparing tweet series on a log10 y axis
#'
#' @param long_df Long-format data frame with columns `date`, `source` and
#'   `tweets` (one row per date and series).
#' @return A ggplot object with one point-and-line series per `source` and a
#'   dashed marker at 2020-01-18.
plot_tweet_sources <- function(long_df) {
  long_df %>%
    ggplot(aes(
      x = date, y = tweets,
      group = source, color = source, linetype = source
    )) +
    geom_point(size = 1) +
    geom_line(size = .5) +
    theme_Publication() +
    scale_x_date(date_labels = "%b/%y", date_breaks = "12 month") +
    # scale_y_continuous(breaks = seq(0, 240, 40), limits = c(0, 240)) +
    labs(y = "Tweets", x = "Month/Year") +
    scale_colour_Publication() +
    theme(legend.position = c(0.8, 0.8)) +
    scale_y_log10(
      breaks = trans_breaks("log10", function(x) 10^x),
      labels = trans_format("log10", math_format(10^.x)),
      limits = c(10^0, 10^8)
    ) +
    annotation_logticks() +
    # Event marker. y is a constant set outside aes(), so it is in log10 units.
    geom_vline(
      xintercept = as.Date("2020-01-18"), col = "black", lwd = 0.5, lty = 2
    ) +
    geom_text(
      x = as.Date("2020-01-18"), y = 3,
      label = "Jan 18: New cases confirmed in Wuhan",
      hjust = 1, vjust = 1, colour = "black", check_overlap = TRUE
    )
}

# Panel A: rows after 2007, COVID vs. AAPI tweets.
fig6_a <- dat_fig6a %>%
  filter(year > 2007) %>%
  pivot_longer(
    cols = c(`COVID tweets`, `AAPI tweets`),
    names_to = "source", values_to = "tweets"
  ) %>%
  plot_tweet_sources()

# Panel B: COVID vs. anti-Chinese politics tweets.
fig6_b <- dat_fig6b %>%
  pivot_longer(
    cols = c(`COVID tweets`, `Anti-Chinese Politics tweets`),
    names_to = "source", values_to = "tweets"
  ) %>%
  plot_tweet_sources()

# Stack the two panels and tag them A and B.
fig6 <- fig6_a / fig6_b + plot_annotation(tag_levels = "A")

# Drawing or saving this figure emits "Removed ... rows containing missing
# values" warnings from geom_point and geom_path (143 rows for panel A and
# 14/13 for panel B in the recorded run).
# NOTE(review): presumably NA counts or values outside the y limits -- confirm
# against the data.
print(fig6)

# Pass the plot explicitly rather than relying on last_plot().
ggsave(
  "./fig6.eps",
  plot = fig6, dpi = 300, width = 187, units = "mm", height = 100
)