library(PubMatrixR)
library(knitr)
library(kableExtra)
library(dplyr)
library(pheatmap)
library(ggplot2)PubMatrixR is an R package designed to analyze literature co-occurrence patterns by systematically searching PubMed and PMC databases. This vignette demonstrates how to:
This package is a heavily fork of the original PubMatrixR by tslaird. Our gratitude to the original author.
For better performance and higher rate limits, we recommend obtaining an NCBI API key:
To obtain your free NCBI API key, visit: https://support.nlm.nih.gov/kbArticle/?pn=KA-05317
Once you have your API key, you can use it in PubMatrixR like this:
For this example, we’ll extract genes related to WNT signaling and obesity from the MSigDB database:
msigdf is optional and not required to build this
vignette. The example below is shown for reference only and is not
executed during package checks.
# Extract WNT-related genes
A <- msigdf::msigdf.human %>%
dplyr::filter(grepl(geneset, pattern = "wnt", ignore.case = TRUE)) %>%
dplyr::pull(symbol) %>%
unique()
# Extract obesity-related genes
B <- msigdf::msigdf.human %>%
dplyr::filter(grepl(geneset, pattern = "obesity", ignore.case = TRUE)) %>%
dplyr::pull(symbol) %>%
unique()
# Sample genes for demonstration (making them equal in length)
A <- sample(A, 10, replace = FALSE)
B <- sample(B, 10, replace = FALSE)Now we’ll search for co-occurrences between our gene sets in PubMed literature:
The co-occurrence matrix shows the number of publications mentioning each pair of genes:
kable(result,
caption = "Co-occurrence Matrix: WNT Genes vs Obesity Genes (Publication Counts)",
align = "c",
format = if (knitr::pandoc_to() == "html") "html" else "markdown"
) %>%
kableExtra::kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
position = "center"
) %>%
kableExtra::add_header_above(c(" " = 1, "Obesity Genes" = length(B)))| WNT1 | WNT2 | WNT3A | WNT5A | WNT7B | CTNNB1 | DVL1 | |
|---|---|---|---|---|---|---|---|
| LEPR | 26 | 35 | 32 | 41 | 45 | 54 | 51 |
| ADIPOQ | 36 | 34 | 39 | 49 | 54 | 52 | 62 |
| PPARG | 34 | 40 | 51 | 57 | 51 | 62 | 68 |
| TNF | 44 | 51 | 58 | 53 | 60 | 72 | 79 |
| IL6 | 49 | 57 | 53 | 61 | 69 | 77 | 73 |
| ADRB2 | 59 | 56 | 65 | 74 | 78 | 75 | 84 |
| INSR | 57 | 67 | 72 | 82 | 75 | 85 | 95 |
Let’s first examine which genes have the highest publication counts:
# Create data frame for List A genes (rows) colored by List B genes (columns)
a_genes_data <- data.frame(
gene = rownames(result),
total_pubs = rowSums(result),
stringsAsFactors = FALSE
)
# Add color coding based on max overlap with B genes
a_genes_data$max_b_gene <- apply(result, 1, function(x) colnames(result)[which.max(x)])
a_genes_data$max_overlap <- apply(result, 1, max)
# Create data frame for List B genes (columns) colored by List A genes (rows)
b_genes_data <- data.frame(
gene = colnames(result),
total_pubs = colSums(result),
stringsAsFactors = FALSE
)
# Add color coding based on max overlap with A genes
b_genes_data$max_a_gene <- apply(result, 2, function(x) rownames(result)[which.max(x)])
b_genes_data$max_overlap <- apply(result, 2, max)
bar_plot_theme <- theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", size = 15),
plot.subtitle = element_text(size = 10),
axis.title = element_text(size = 11),
axis.text = element_text(size = 10),
panel.grid.major.y = element_blank(),
legend.position = "bottom",
legend.title = element_text(size = 10),
legend.text = element_text(size = 9),
legend.box = "vertical"
)
n_fill_a <- length(unique(a_genes_data$max_b_gene))
n_fill_b <- length(unique(b_genes_data$max_a_gene))
# Plot A genes colored by their strongest B gene partner
p1 <- ggplot(a_genes_data, aes(x = reorder(gene, total_pubs), y = total_pubs, fill = max_b_gene)) +
geom_col(width = 0.75) +
coord_flip() +
labs(
title = "List A Genes by Publication Count",
subtitle = "Colored by strongest List B partner",
x = "Genes (List A)",
y = "Total Publications",
fill = "Strongest B Partner"
) +
scale_fill_viridis_d(option = "D", end = 0.9) +
guides(fill = guide_legend(nrow = max(1, ceiling(n_fill_a / 4)), byrow = TRUE)) +
bar_plot_theme
# Plot B genes colored by their strongest A gene partner
p2 <- ggplot(b_genes_data, aes(x = reorder(gene, total_pubs), y = total_pubs, fill = max_a_gene)) +
geom_col(width = 0.75) +
coord_flip() +
labs(
title = "List B Genes by Publication Count",
subtitle = "Colored by strongest List A partner",
x = "Genes (List B)",
y = "Total Publications",
fill = "Strongest A Partner"
) +
scale_fill_viridis_d(option = "D", end = 0.9) +
guides(fill = guide_legend(nrow = max(1, ceiling(n_fill_b / 4)), byrow = TRUE)) +
bar_plot_theme
print(p1)The heatmap displays overlap percentages calculated from the publication co-occurrence counts:
plot_pubmatrix_heatmap(result,
title = "WNT-Obesity Overlap (%)",
show_numbers = TRUE,
cellwidth = 44,
cellheight = 32,
width = 12,
height = 10
)Overlap percentage heatmap with values displayed in each cell
For a cleaner visualization without numbers, useful for presentations:
plot_pubmatrix_heatmap(result,
title = "WNT-Obesity Co-occurrence (Clean)",
show_numbers = FALSE,
cellwidth = 44,
cellheight = 32,
width = 12,
height = 10
)Co-occurrence heatmap without numbers for better visual clarity
Asymmetric lists
# Intentionally not run during package checks: this performs live NCBI queries.
A <- c("NCOR2", "NCSTN", "NKD1", "NOTCH1", "NOTCH4", "NUMB", "PPARD", "PSEN2", "PTCH1", "RBPJ", "SKP2", "TCF7", "TP53")
B <- c("EIF1", "EIF1AX", "EIF2B1", "EIF2B2", "EIF2B3", "EIF2B4", "EIF2B5", "EIF2S1", "EIF2S2", "EIF2S3", "ELAVL1")
# Run actual PubMatrix analysis
current_year <- as.integer(format(Sys.Date(), "%Y"))
result <- PubMatrix(
A = A,
B = B,
Database = "pubmed",
daterange = c(2020, current_year),
outfile = "pubmatrix_result"
)# Offline deterministic example used for vignette rendering/package checks.
A <- c("NCOR2", "NCSTN", "NKD1", "NOTCH1", "NOTCH4", "NUMB", "PPARD", "PSEN2", "PTCH1", "RBPJ", "SKP2", "TCF7", "TP53")
B <- c("EIF1", "EIF1AX", "EIF2B1", "EIF2B2", "EIF2B3", "EIF2B4", "EIF2B5", "EIF2S1", "EIF2S2", "EIF2S3", "ELAVL1")
result <- outer(seq_along(B), seq_along(A), function(i, j) {
12 + (i * 4) + (j * 3) + ((i * j) %% 9) + ((i + 2 * j) %% 6)
})
result <- as.data.frame(result, check.names = FALSE)
colnames(result) <- A
rownames(result) <- BThe co-occurrence matrix shows the number of publications mentioning each pair of genes:
kable(result,
caption = "Co-occurrence Matrix: Longer Lists (Publication Counts)",
align = "c",
format = if (knitr::pandoc_to() == "html") "html" else "markdown"
) %>%
kableExtra::kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
position = "center"
) %>%
kableExtra::add_header_above(c(" " = 1, "A Genes" = length(A)))| NCOR2 | NCSTN | NKD1 | NOTCH1 | NOTCH4 | NUMB | PPARD | PSEN2 | PTCH1 | RBPJ | SKP2 | TCF7 | TP53 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| EIF1 | 23 | 29 | 29 | 35 | 41 | 41 | 47 | 53 | 44 | 50 | 56 | 56 | 62 |
| EIF1AX | 29 | 30 | 37 | 44 | 36 | 43 | 50 | 51 | 49 | 56 | 57 | 64 | 71 |
| EIF2B1 | 35 | 37 | 36 | 44 | 46 | 45 | 53 | 55 | 54 | 62 | 64 | 63 | 71 |
| EIF2B2 | 35 | 44 | 44 | 47 | 47 | 56 | 50 | 59 | 59 | 62 | 71 | 71 | 74 |
| EIF2B3 | 41 | 42 | 52 | 47 | 57 | 58 | 62 | 63 | 64 | 68 | 69 | 79 | 74 |
| EIF2B4 | 47 | 49 | 45 | 56 | 58 | 54 | 65 | 67 | 63 | 74 | 76 | 72 | 83 |
| EIF2B5 | 53 | 56 | 53 | 56 | 68 | 65 | 68 | 71 | 68 | 80 | 83 | 80 | 83 |
| EIF2S1 | 59 | 57 | 61 | 65 | 63 | 67 | 71 | 69 | 73 | 86 | 84 | 88 | 92 |
| EIF2S2 | 56 | 55 | 60 | 65 | 64 | 69 | 74 | 73 | 78 | 83 | 82 | 87 | 92 |
| EIF2S3 | 56 | 62 | 68 | 68 | 74 | 80 | 80 | 86 | 83 | 83 | 89 | 95 | 95 |
| ELAVL1 | 62 | 69 | 76 | 77 | 75 | 82 | 83 | 90 | 88 | 89 | 96 | 103 | 104 |
# Create data frame for List A genes (rows) colored by List B genes (columns)
a_genes_data2 <- data.frame(
gene = rownames(result),
total_pubs = rowSums(result),
stringsAsFactors = FALSE
)
# Add color coding based on max overlap with B genes
a_genes_data2$max_b_gene <- apply(result, 1, function(x) colnames(result)[which.max(x)])
a_genes_data2$max_overlap <- apply(result, 1, max)
# Create data frame for List B genes (columns) colored by List A genes (rows)
b_genes_data2 <- data.frame(
gene = colnames(result),
total_pubs = colSums(result),
stringsAsFactors = FALSE
)
# Add color coding based on max overlap with A genes
b_genes_data2$max_a_gene <- apply(result, 2, function(x) rownames(result)[which.max(x)])
b_genes_data2$max_overlap <- apply(result, 2, max)
bar_plot_theme <- theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", size = 15),
plot.subtitle = element_text(size = 10),
axis.title = element_text(size = 11),
axis.text = element_text(size = 10),
panel.grid.major.y = element_blank(),
legend.position = "bottom",
legend.title = element_text(size = 10),
legend.text = element_text(size = 8.5),
legend.box = "vertical"
)
n_fill_a2 <- length(unique(a_genes_data2$max_b_gene))
n_fill_b2 <- length(unique(b_genes_data2$max_a_gene))
# Plot A genes colored by their strongest B gene partner
p3 <- ggplot(a_genes_data2, aes(x = reorder(gene, total_pubs), y = total_pubs, fill = max_b_gene)) +
geom_col(width = 0.75) +
coord_flip() +
labs(
title = "List A Genes by Publication Count (Asymmetric)",
subtitle = "Colored by strongest List B partner",
x = "Genes (List A)",
y = "Total Publications",
fill = "Strongest B Partner"
) +
scale_fill_viridis_d(option = "D", end = 0.9) +
guides(fill = guide_legend(nrow = max(1, ceiling(n_fill_a2 / 4)), byrow = TRUE)) +
bar_plot_theme
# Plot B genes colored by their strongest A gene partner
p4 <- ggplot(b_genes_data2, aes(x = reorder(gene, total_pubs), y = total_pubs, fill = max_a_gene)) +
geom_col(width = 0.75) +
coord_flip() +
labs(
title = "List B Genes by Publication Count (Asymmetric)",
subtitle = "Colored by strongest List A partner",
x = "Genes (List B)",
y = "Total Publications",
fill = "Strongest A Partner"
) +
scale_fill_viridis_d(option = "D", end = 0.9) +
guides(fill = guide_legend(nrow = max(1, ceiling(n_fill_b2 / 4)), byrow = TRUE)) +
bar_plot_theme
print(p3)The heatmap displays overlap percentages calculated from the publication co-occurrence counts:
plot_pubmatrix_heatmap(result,
title = "Asymmetric Lists Overlap (%)",
show_numbers = TRUE,
cellwidth = 44,
cellheight = 32,
width = 12,
height = 10
)Overlap percentage heatmap with values displayed in each cell
For a cleaner visualization without numbers, useful for presentations:
plot_pubmatrix_heatmap(result,
title = "Asymmetric Lists Co-occurrence (Clean)",
show_numbers = FALSE,
cellwidth = 44,
cellheight = 32,
width = 12,
height = 10
)Co-occurrence heatmap without numbers for better visual clarity
This vignette demonstrated:
The resulting matrices and visualizations can help identify: - Strong literature connections between gene sets - Potential research gaps (low co-occurrence pairs) - Patterns in publication trends over time - Most studied genes and their strongest literature partners
sessionInfo()
#> R version 4.5.2 (2025-10-31)
#> Platform: aarch64-apple-darwin20
#> Running under: macOS Tahoe 26.3.1
#>
#> Matrix products: default
#> BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
#>
#> locale:
#> [1] C/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
#>
#> time zone: Europe/London
#> tzcode source: internal
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] ggplot2_4.0.2 pheatmap_1.0.13 dplyr_1.2.0 kableExtra_1.4.0
#> [5] knitr_1.51 PubMatrixR_1.0.0
#>
#> loaded via a namespace (and not attached):
#> [1] gtable_0.3.6 jsonlite_2.0.0 compiler_4.5.2 tidyselect_1.2.1
#> [5] xml2_1.5.2 stringr_1.6.0 parallel_4.5.2 jquerylib_0.1.4
#> [9] systemfonts_1.3.1 scales_1.4.0 textshaping_1.0.4 yaml_2.3.12
#> [13] fastmap_1.2.0 R6_2.6.1 labeling_0.4.3 generics_0.1.4
#> [17] tibble_3.3.1 svglite_2.2.2 bslib_0.10.0 pillar_1.11.1
#> [21] RColorBrewer_1.1-3 readODS_2.3.2 rlang_1.1.7 cachem_1.1.0
#> [25] stringi_1.8.7 xfun_0.56 S7_0.2.1 sass_0.4.10
#> [29] otel_0.2.0 viridisLite_0.4.3 cli_3.6.5 withr_3.0.2
#> [33] magrittr_2.0.4 grid_4.5.2 digest_0.6.39 rstudioapi_0.18.0
#> [37] pbapply_1.7-4 lifecycle_1.0.5 vctrs_0.7.1 evaluate_1.0.5
#> [41] glue_1.8.0 farver_2.1.2 rmarkdown_2.30 tools_4.5.2
#> [45] pkgconfig_2.0.3 htmltools_0.5.9