Lab5 Data Visualization

Learning Objectives

This tutorial introduces some basic examples of data visualization with the R package ggplot2.

You should read Healy’s data visualization book for more details https://socviz.co/
You should also check ggplot2 cookbook for more details.

Replicating anti-AAPI hate speech data viz

The following code replicates the figures of a paper I have been working on for a while. The paper, data, and code are available at https://osf.io/xtw4c/. If you find any bugs, please let me know.

The goal is to demonstrate some basic academic visualizations and the workflow used to produce them.

if (!requireNamespace("pacman")) install.packages('pacman')

## Loading required namespace: pacman

library(pacman)

packages<-c("tidyverse","lubridate","glue",
            "extrafont","ggthemes","ggrepel",
            "patchwork","latex2exp",
            "scales","cowplot")
p_load(packages,character.only = TRUE)


## ggplot themes and scales
#' Publication-style ggplot2 theme
#'
#' Builds on `ggthemes::theme_foundation()` and then removes grid lines,
#' draws black axis lines, turns the axis ticks inward (negative tick
#' length) and places the legend on the right.
#'
#' @param base_size Base font size passed to `theme_foundation()`.
#' @param base_family Base font family passed to `theme_foundation()`.
#'   The previous version accepted this argument but never forwarded it, so
#'   the plots were always drawn with the device default font. The default
#'   is therefore `""` (device default), which keeps existing figures
#'   unchanged while making an explicitly supplied family take effect.
#' @return A ggplot2 theme object that can be added to a plot with `+`.
theme_Publication <- function(base_size = 11, base_family = "") {
  # Namespaced calls replace the former library(grid) / library(ggthemes)
  # calls, so using the theme no longer alters the search path.
  (ggthemes::theme_foundation(base_size = base_size,
                              base_family = base_family)
    + theme(plot.title = element_text(size = rel(1.2), hjust = 0.5),
            text = element_text(),
            panel.background = element_rect(colour = NA),
            plot.background = element_rect(colour = NA),
            panel.border = element_rect(colour = NA),
            axis.title = element_text(size = rel(1)),
            axis.title.y = element_text(angle = 90, vjust = 2),
            axis.title.x = element_text(vjust = -0.2),
            axis.text = element_text(),
            axis.line = element_line(colour = "black"),
            axis.ticks = element_line(),
            # Negative length: ticks point into the panel.
            axis.ticks.length = grid::unit(-1.4, "mm"),
            # Push the tick labels away from the inward-pointing ticks.
            axis.text.x = element_text(margin = margin(t = 2.5, unit = "mm")),
            axis.text.y = element_text(margin = margin(r = 2.5, unit = "mm")),
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            legend.key = element_rect(colour = NA),
            legend.position = "right",
            legend.margin = margin(t = 0, unit = "cm"),
            legend.title = element_text(face = "italic"),
            # Order is top, right, bottom, left.
            plot.margin = margin(10, 5, 5, 5, unit = "mm"),
            strip.background = element_rect(colour = "#f0f0f0",
                                            fill = "#f0f0f0"),
            strip.text = element_text(face = "bold")
    ))
}

# Colours shared by the fill and colour "Publication" scales. Keeping the
# vector in one place guarantees both scales stay in sync.
.publication_colours <- c("#386cb0", "#fdb462",
                          "#7fc97f", "#ef3b2c",
                          "#662506", "#a6cee3",
                          "#fb9a99", "#984ea3",
                          "#fa9fb5", "#8856a7",
                          "#9ebcda", "#e0ecf4",
                          "#f03b20", "#feb24c")

# Build a discrete "Publication" scale for one aesthetic ("fill" or
# "colour"). Extra arguments are forwarded to discrete_scale() in the same
# positions as before.
# NOTE(review): recent ggplot2 releases appear to deprecate the positional
# scale name ("Publication") argument of discrete_scale() -- confirm against
# the installed version before dropping it.
.publication_scale <- function(aesthetic, ...) {
  # scales::manual_pal() is namespaced so that no library() call (and thus
  # no change to the search path) happens when a scale is created.
  discrete_scale(aesthetic, "Publication",
                 scales::manual_pal(values = .publication_colours), ...)
}

#' Discrete fill scale using the Publication palette
#'
#' @param ... Further arguments forwarded to `discrete_scale()`.
#' @return A ggplot2 discrete scale for the `fill` aesthetic.
scale_fill_Publication <- function(...) {
  .publication_scale("fill", ...)
}

#' Discrete colour scale using the Publication palette
#'
#' @param ... Further arguments forwarded to `discrete_scale()`.
#' @return A ggplot2 discrete scale for the `colour` aesthetic.
scale_colour_Publication <- function(...) {
  .publication_scale("colour", ...)
}


# REPLICATE FIGURE 1


# Load the Figure 1 data from the course website. The column specification
# printed below lists the columns classifier, year, week, tweets and date.
dat_fig1 <- read_csv(url("https://yongjunzhang.com/files/css/dat_fig1.csv"))

## Rows: 684 Columns: 5

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): classifier
## dbl  (3): year, week, tweets
## date (1): date

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Figure 1: tweet counts over time for four classifiers, drawn on a log10
# y axis with two dated event markers, then saved as an EPS file.

# Raw classifier name -> legend label. The same vector drives both the row
# filter and the relabelling, so the two lists cannot drift apart.
fig1_labels <- c(
  "All, COVID19 hate terms" = "All-COVID19 hate terms",
  "Ziems-Hate" = "Ziems-Hate",
  "Davidson-Hate speech" = "Davidson-Hate",
  "Vidgen-entity_directed_hostility" = "Vidgen-Hostility"
)

fig1 <- dat_fig1 %>%
  filter(classifier %in% names(fig1_labels)) %>%
  mutate(classifier = unname(fig1_labels[classifier])) %>%
  ggplot(aes(x = date, y = tweets, group = classifier,
             color = classifier, shape = classifier)) +
  geom_point(size = 1) +
  geom_line(size = .5) +
  theme_Publication() +
  scale_x_date(date_labels = "%b/%y", date_breaks = "2 month") +
  labs(y = "Tweets", x = "Month/Year") +
  scale_colour_Publication() +
  # Event markers. NOTE(review): the constant y values of the labels (7 and
  # 7.5) appear to be read on the transformed log10 axis, i.e. near 10^7 --
  # confirm on the rendered figure.
  geom_vline(xintercept = as.Date("2020-01-18"), col = "blue",
             lwd = 0.5, lty = 2) +
  geom_text(x = as.Date("2020-01-18"), y = 7,
            label = "Jan 18: New cases confirmed in Wuhan",
            hjust = 0, vjust = 0, colour = "blue", check_overlap = TRUE) +
  geom_vline(xintercept = as.Date("2020-03-16"), col = "#ef3b2c",
             lwd = .5, lty = 2) +
  geom_text(x = as.Date("2020-03-16"), y = 7.5,
            label = "Mar 16: Trump Tweeted ChinaVirus",
            hjust = 0, vjust = 0, colour = "#ef3b2c", check_overlap = TRUE) +
  # Applied after theme_Publication() so the legend moves inside the panel.
  theme(legend.position = c(0.8, 0.9)) +
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
                labels = trans_format("log10", math_format(10^.x)),
                limits = c(10^0, 10^8)) +
  annotation_logticks()

# Display the figure (auto-printed at top level, as before).
fig1

# Pass the plot explicitly instead of relying on last_plot(), so the saved
# file cannot silently pick up a different plot.
ggsave("./fig1.eps", plot = fig1, dpi = 300, width = 187, units = "mm",
       height = 100)



# REPLICATE FIGURE 2


# Load the Figure 2 data from the course website. The column specification
# printed below lists the columns keywords, All, year, month, tweets and date.
data_fig2 <- read_csv(url("https://yongjunzhang.com/files/css/dat_fig2.csv"))

## Rows: 1872 Columns: 6

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): keywords, All
## dbl  (3): year, month, tweets
## date (1): date

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Figure 2: tweet counts over time per keyword on a log10 y axis, with one
# point shape per keyword and the legend at the bottom; saved as EPS.
#
# The original aes() also contained `linetypes = keywords`. `linetypes` is
# not a ggplot2 aesthetic (the real one is `linetype`), so it had no effect
# and is removed here; the rendered figure is unchanged.
# NOTE(review): mapping `linetype = keywords` may have been intended, but
# the default linetype palette is limited in size and may not cover every
# keyword -- confirm before adding it.
fig2 <- data_fig2 %>%
  # One is added to every count before plotting -- presumably so that zero
  # counts remain drawable on the log10 axis; TODO confirm.
  ggplot(aes(x = date, y = tweets + 1, group = keywords,
             color = keywords, shape = keywords)) +
  geom_point(size = .5) +
  geom_line(size = .5) +
  theme_Publication() +
  scale_x_date(date_labels = "%b/%y", date_breaks = "12 month") +
  labs(y = "Tweets", x = "Month/Year") +
  scale_colour_Publication() +
  geom_vline(xintercept = as.Date("2020-01-18"), col = "blue",
             lwd = 0.5, lty = 2) +
  # NOTE(review): the constant y = 5.9 appears to be read on the transformed
  # log10 axis (just below the 10^6 upper limit) -- confirm on the figure.
  geom_text(x = as.Date("2020-01-18"), y = 5.9,
            label = "Jan 18: New cases confirmed in Wuhan",
            hjust = 1, vjust = 1, colour = "blue", check_overlap = TRUE) +
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
                labels = trans_format("log10", math_format(10^.x)),
                limits = c(10^0, 10^6)) +
  annotation_logticks() +
  # Explicit shape codes 0..24, one per keyword.
  scale_shape_manual(values = 0:24) +
  theme(legend.position = "bottom")

# Display the figure (auto-printed at top level, as before).
fig2

# Save the plot object explicitly rather than relying on last_plot().
ggsave("./fig2.eps", plot = fig2, dpi = 300, width = 187, units = "mm",
       height = 160)




# REPLICATE FIGURE 3


# Load the Figure 3 data from the course website. The column specification
# printed below lists the columns classifier, year, week, tweets and date.
dat_fig3 <- read_csv(url("https://yongjunzhang.com/files/css/dat_fig3.csv"))

## Rows: 1060 Columns: 5

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): classifier
## dbl  (3): year, week, tweets
## date (1): date

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Figure 3: tweet counts over time for four classifiers (anti-Chinese
# politics terms), drawn on a log10 y axis with two dated event markers,
# then saved as an EPS file.

# Raw classifier name -> legend label. The same vector drives both the row
# filter and the relabelling, so the two lists cannot drift apart.
fig3_labels <- c(
  "All, anti-Chinese politics terms" = "All-anti Chinese terms",
  "Ziems-Hate" = "Ziems-Hate",
  "Davidson-Hate speech" = "Davidson-Hate",
  "Vidgen-entity_directed_hostility" = "Vidgen-Hostility"
)

fig3 <- dat_fig3 %>%
  filter(classifier %in% names(fig3_labels)) %>%
  mutate(classifier = unname(fig3_labels[classifier])) %>%
  ggplot(aes(x = date, y = tweets, group = classifier,
             color = classifier, shape = classifier)) +
  geom_point(size = 1) +
  geom_line(size = .5) +
  theme_Publication() +
  scale_x_date(date_labels = "%b/%y", date_breaks = "2 month") +
  labs(y = "Tweets", x = "Month/Year") +
  scale_colour_Publication() +
  # Event markers. NOTE(review): the constant y values of the labels (7 and
  # 7.5) appear to be read on the transformed log10 axis, i.e. near 10^7 --
  # confirm on the rendered figure.
  geom_vline(xintercept = as.Date("2020-01-18"), col = "blue",
             lwd = 0.5, lty = 2) +
  geom_text(x = as.Date("2020-01-18"), y = 7,
            label = "Jan 18: New cases confirmed in Wuhan",
            hjust = 0, vjust = 0, colour = "blue", check_overlap = TRUE) +
  geom_vline(xintercept = as.Date("2020-03-16"), col = "#ef3b2c",
             lwd = .5, lty = 2) +
  geom_text(x = as.Date("2020-03-16"), y = 7.5,
            label = "Mar 16: Trump Tweeted ChinaVirus",
            hjust = 0, vjust = 0, colour = "#ef3b2c", check_overlap = TRUE) +
  # Applied after theme_Publication() so the legend moves inside the panel.
  theme(legend.position = c(0.8, 0.2)) +
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
                labels = trans_format("log10", math_format(10^.x)),
                limits = c(10^0, 10^8)) +
  annotation_logticks()

# Display the figure (auto-printed at top level, as before).
fig3

# Pass the plot explicitly instead of relying on last_plot(), so the saved
# file cannot silently pick up a different plot.
ggsave("./fig3.eps", plot = fig3, dpi = 300, width = 187, units = "mm",
       height = 130)



# REPLICATE FIGURE 4


# Load the Figure 4 data from the course website. The column specification
# printed below lists the columns year, week, tweets and date.
dat_fig4 <- read_csv(url("https://yongjunzhang.com/files/css/dat_fig4.csv"))

## Rows: 211 Columns: 4

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (3): year, week, tweets
## date (1): date

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Figure 4: a single tweet-count series on a log10 y axis with three dated
# event markers; saved as an EPS file.
#
# The original chain also added scale_colour_Publication() and
# theme(legend.position = c(0.8, 0.8)). No aesthetic is mapped to colour
# here (the colour is a fixed constant), so there is no legend and both
# calls had no effect; they are removed.
fig4_colour <- "#feb24c"

fig4 <- dat_fig4 %>%
  ggplot(aes(x = date, y = tweets)) +
  geom_point(size = 1, color = fig4_colour) +
  geom_line(size = .5, color = fig4_colour) +
  theme_Publication() +
  scale_x_date(date_labels = "%b/%y", date_breaks = "2 month") +
  labs(y = "Tweets", x = "Month/Year") +
  # Event markers. NOTE(review): the constant y values of the labels (5, 7.5
  # and 7) appear to be read on the transformed log10 axis -- confirm on the
  # rendered figure.
  geom_vline(xintercept = as.Date("2020-01-18"), col = "blue",
             lwd = 0.5, lty = 2) +
  geom_text(x = as.Date("2020-01-18"), y = 5,
            label = "Jan 18: New cases confirmed in Wuhan",
            hjust = 0, vjust = 0, colour = "blue", check_overlap = TRUE) +
  geom_vline(xintercept = as.Date("2020-03-16"), col = "#ef3b2c",
             lwd = .5, lty = 2) +
  geom_text(x = as.Date("2020-03-16"), y = 7.5,
            label = "Mar 16: Trump Tweeted ChinaVirus",
            hjust = 0, vjust = 0, colour = "#ef3b2c", check_overlap = TRUE) +
  geom_vline(xintercept = as.Date("2021-03-16"), col = "black",
             lwd = .5, lty = 2) +
  # Right/top-justified so the text sits to the left of the 2021 line.
  geom_text(x = as.Date("2021-03-16"), y = 7,
            label = "Mar 16: Atlanta Spa Mass Shootings",
            hjust = 1, vjust = 1, colour = "black", check_overlap = TRUE) +
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
                labels = trans_format("log10", math_format(10^.x)),
                limits = c(10^0, 10^8)) +
  annotation_logticks()

# Display the figure (auto-printed at top level, as before).
fig4

# Save the plot object explicitly rather than relying on last_plot().
ggsave("./fig4.eps", plot = fig4, dpi = 300, width = 187, units = "mm",
       height = 100)



# REPLICATE FIGURE 5


# Load the Figure 5 data from the course website. The column specification
# printed below lists the columns data, hashtag and tweets.
dat_fig5 <- read_csv(url("https://yongjunzhang.com/files/css/dat_fig5.csv"))

## Rows: 40 Columns: 3

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): data, hashtag
## dbl (1): tweets

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Figure 5: four horizontal bar charts of tweet counts per hashtag, one per
# value of the `data` column, arranged 2 x 2 with patchwork and tagged A-D.
#
# The four panels were previously four copies of the same pipeline that
# differed only in the filter value; they are now produced by one helper.
# The panels are also no longer stored in variables named `a`..`d`, which
# avoids shadowing base::c() with a plot object. The original
# scale_colour_Publication() call is dropped because no aesthetic is mapped
# to colour in these panels, so it had no effect.

# Build one panel: horizontal bars of `tweets` per `hashtag` on a log10
# axis, for the rows of `df` whose `data` column equals `category`.
plot_hashtag_panel <- function(df, category) {
  df %>%
    # `.data$data` makes it explicit that the column named `data` is meant.
    filter(.data$data == category) %>%
    ggplot() +
    geom_col(aes(x = hashtag, y = tweets),
             width = 0.2,
             position = "dodge") +
    coord_flip() +
    labs(y = "", x = "") +
    scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
                  labels = trans_format("log10", math_format(10^.x)),
                  limits = c(10^0, 10^8)) +
    annotation_logticks(sides = "b") +
    theme_Publication()
}

fig5_a <- plot_hashtag_panel(dat_fig5, "Anti-Chinese Politics")
fig5_b <- plot_hashtag_panel(dat_fig5, "Counter Hate")
fig5_c <- plot_hashtag_panel(dat_fig5, "COVID-Specific Hate")
fig5_d <- plot_hashtag_panel(dat_fig5, "General Anti-AAPI Hate")

# The original wrapped the first panel in parentheses, which displayed it on
# its own; keep that preview.
fig5_a

library(patchwork)

# `|` places panels side by side, `/` stacks the two rows.
fig5 <- (fig5_a | fig5_b) / (fig5_c | fig5_d) +
  plot_annotation(tag_levels = "A")

# Display the combined figure (auto-printed at top level, as before).
fig5

# Save the combined plot explicitly rather than relying on last_plot().
ggsave("./fig5.eps", plot = fig5, dpi = 300, width = 187, units = "mm",
       height = 200)



# REPLICATE FIGURE 6

# Load the Figure 6, panel A data from the course website. The column
# specification printed below lists the columns year, month, COVID tweets,
# AAPI tweets and date.
dat_fig6a <- read_csv(url("https://yongjunzhang.com/files/css/dat_fig6a.csv"))

## Rows: 164 Columns: 5

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (4): year, month, COVID tweets, AAPI tweets
## date (1): date

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load the Figure 6, panel B data from the course website. The column
# specification printed below lists the columns year, week, COVID tweets,
# Anti-Chinese Politics tweets and date.
dat_fig6b <- read_csv(url("https://yongjunzhang.com/files/css/dat_fig6b.csv"))

## Rows: 71 Columns: 5

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (4): year, week, COVID tweets, Anti-Chinese Politics tweets
## date (1): date

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Figure 6: two stacked panels, each comparing two tweet-count series over
# time on a log10 y axis with one dated event marker; saved as EPS.
#
# Both panels were previously two copies of the same pipeline that differed
# only in the input table and the pair of columns being compared; they are
# now produced by one helper.

# Build one panel. `series` names the wide columns of `df` that are stacked
# into a long `source` / `tweets` pair and drawn as separate lines.
plot_tweet_series <- function(df, series) {
  df %>%
    pivot_longer(cols = all_of(series),
                 names_to = "source", values_to = "tweets") %>%
    ggplot(aes(x = date, y = tweets, group = source,
               color = source, linetype = source)) +
    geom_point(size = 1) +
    geom_line(size = .5) +
    theme_Publication() +
    scale_x_date(date_labels = "%b/%y", date_breaks = "12 month") +
    labs(y = "Tweets", x = "Month/Year") +
    scale_colour_Publication() +
    # Applied after theme_Publication() so the legend moves inside the panel.
    theme(legend.position = c(0.8, 0.8)) +
    scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
                  labels = trans_format("log10", math_format(10^.x)),
                  limits = c(10^0, 10^8)) +
    annotation_logticks() +
    geom_vline(xintercept = as.Date("2020-01-18"), col = "black",
               lwd = 0.5, lty = 2) +
    # NOTE(review): the constant y = 3 appears to be read on the transformed
    # log10 axis (i.e. near 10^3) -- confirm on the rendered figure.
    geom_text(x = as.Date("2020-01-18"), y = 3,
              label = "Jan 18: New cases confirmed in Wuhan",
              hjust = 1, vjust = 1, colour = "black", check_overlap = TRUE)
}

# Panel A keeps only rows with year > 2007, as in the original.
fig6_a <- dat_fig6a %>%
  filter(year > 2007) %>%
  plot_tweet_series(c("COVID tweets", "AAPI tweets"))

fig6_b <- dat_fig6b %>%
  plot_tweet_series(c("COVID tweets", "Anti-Chinese Politics tweets"))

# `/` stacks panel A on top of panel B.
fig6 <- fig6_a / fig6_b + plot_annotation(tag_levels = "A")

# Display the combined figure (auto-printed at top level, as before).
fig6

## Warning: Removed 143 rows containing missing values (geom_point).

## Warning: Removed 143 row(s) containing missing values (geom_path).

## Warning: Removed 14 rows containing missing values (geom_point).

## Warning: Removed 13 row(s) containing missing values (geom_path).

# Save the combined plot explicitly rather than relying on last_plot().
ggsave("./fig6.eps", plot = fig6, dpi = 300, width = 187, units = "mm",
       height = 100)

## Warning: Removed 143 rows containing missing values (geom_point).

## Warning: Removed 143 row(s) containing missing values (geom_path).

## Warning: Removed 14 rows containing missing values (geom_point).

## Warning: Removed 13 row(s) containing missing values (geom_path).

Lab5 Data Visualization

Lab5 Data Visualization

Learning Objectives

Replicating anti-AAPI hate speech data viz

THE END…