In this first post, I will show you a simple script that performs a frequency analysis of the R packages I use most. With this analysis you can find out which R packages are loaded most often across a collection of R files (.R and .Rmd files). I will use my own R files as an example.
At the end of the post, I will show you the results that I got using the R files that I developed from 2012 until the middle of 2018, and I will also show you the R code that I used, in case you want to perform the same analysis using your own R files.
We will cover these easy steps:
1) How to read the content of all the R script files, and how to look for the words: library() or require()
2) How to extract each package xxxxx that has been called with library(xxxxx) or require(xxxxx) and store it in a table
3) Then we will aggregate the packages
4) And finally, we will plot the results
Step 1: a listing of all files
First of all, you need to gather all the R and Rmd files that you want to use for the package frequency analysis. Copy all the files into a folder; it can contain subfolders if you want. In this example (for OS X) the folder will be: ~/choose/your/working/directory/.
Then open RStudio and set your working directory as:
|
1 2 |
setwd("~/choose/your/working/directory/") |
The next code will list all the files with extension “.R” or “.Rmd” inside ~/choose/your/working/directory/ (including its subfolders). The paths to all these files will be stored in the vector FILES.
|
1 2 3 4 |
# Collect the paths of every .R and .Rmd file below the working directory.
#
# list.files() interprets `pattern` as a regular expression, not a shell glob,
# so the dot is escaped and the extension is anchored at the end of the name.
# include.dirs is left at its default (FALSE): a directory whose name happens
# to end in ".R" must not be returned, because every entry of FILES is opened
# and read as a text file later on.
files_R1 <- list.files("./", pattern = "\\.R$", all.files = TRUE,
                       full.names = TRUE, recursive = TRUE)
files_R2 <- list.files("./", pattern = "\\.Rmd$", all.files = TRUE,
                       full.names = TRUE, recursive = TRUE)
FILES <- c(files_R1, files_R2)
Step 2: reading the R scripts
The next piece of code will read (using a for-loop statement) the contents of each script file, and will extract the names of the packages that have been called using library(xxx) or require(xxx). The result will be stored in the list WDfreq (one small data frame per file), which is combined into a single data frame right after the loop.
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
# Scan every file in FILES for library(pkg) / require(pkg) calls and build one
# small data frame per file: package name, number of calls and file metadata.
# WDfreq stays a list here; it is row-bound into one data frame after the loop.
#
# How this differs from a grep()-on-split-words approach:
#  * seq_along() makes an empty FILES vector a no-op instead of looping over
#    c(1, 0).
#  * The lines are matched directly with one regular expression, so there is
#    no need for stringr nor for stripping quotes/parentheses afterwards.
#  * Every call on a line is counted (e.g. "library(a); library(b)"), and only
#    the package name is kept, so "library(x, warn.conflicts = FALSE)"
#    yields "x".
#  * file.info() is queried once per file.
#
# Note: calls inside comments or prose are still counted, as the match is
# purely textual.

# Capture group 1 is the package name; it may be quoted and may be passed as
# `package = ...`.
pkg_call <- paste0(
  "\\b(?:library|require)\\(\\s*(?:package\\s*=\\s*)?",
  "[\"']?([A-Za-z][A-Za-z0-9.]*)"
)

WDfreq <- vector("list", length(FILES))
for (i in seq_along(FILES)) {
  if (i %% 100 == 0) cat("\nfiles loaded:", i, "/", length(FILES))

  # A directory cannot be read as text; skip it instead of failing.
  if (dir.exists(FILES[i])) next

  TEXT <- readLines(FILES[i], warn = FALSE, encoding = "UTF-8")

  # useBytes = TRUE keeps the matching robust against files that are not
  # valid UTF-8; the pattern itself is pure ASCII.
  hits <- gregexpr(pkg_call, TEXT, perl = TRUE, useBytes = TRUE)
  calls <- unlist(regmatches(TEXT, hits))
  if (length(calls) == 0) next

  # Each element of `calls` is exactly one match, so replacing the whole
  # match by group 1 leaves just the package name.
  RESULT <- sub(pkg_call, "\\1", calls, perl = TRUE, useBytes = TRUE)

  # word frequencies plus file metadata
  counts <- table(RESULT)
  info <- file.info(FILES[i])
  WDfreq[[i]] <- data.frame(
    pk = names(counts),
    rep = as.integer(counts),
    file = basename(FILES[i]),
    size = info$size,
    mtime = info$mtime,
    year = format(info$mtime, format = "%Y"),
    stringsAsFactors = FALSE
  )
}
|
1 2 3 4 5 6 |
## ## files loaded: 100 / 591 ## files loaded: 200 / 591 ## files loaded: 300 / 591 ## files loaded: 400 / 591 ## files loaded: 500 / 591 |
|
1 2 3 |
# Stack the per-file data frames into one table and report how many
# (package, file) rows it holds.
WDfreq <- do.call(what = "rbind", args = WDfreq)
cat("\nTotal packages:", NROW(WDfreq))
|
1 2 |
## ## Total packages: 2907 |
You will get the data frame shown below, with these columns:
- pk: R package
- rep: number of package calls in each file
- file: R or Rmd file
- size: size of the file in bytes
- mtime: last file modification date-time
- year: last file modification year
|
1 2 |
tail(WDfreq) |
|
1 2 3 4 5 6 7 |
## pk rep file size mtime year ## 2902 raster 1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018 ## 2903 RColorBrewer 1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018 ## 2904 reshape 1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018 ## 2905 rgdal 1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018 ## 2906 RgoogleMaps 1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018 ## 2907 sp 1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018 |
Step 3: aggregating package names
Next, I will aggregate the rows with the same package name (adding up the rep column) in order to see which packages I used most frequently. This can be done with the ddply function from the plyr package.
|
1 2 3 4 |
# Collapse the per-file counts into one row per package (summing `rep`) and
# express each package total as a percentage of all counted calls.
library(plyr)

WDfreq2 <- ddply(
  .data = WDfreq,
  .variables = .(pk),
  .fun = summarise,
  rep = sum(rep)
)
WDfreq2[["perc"]] <- (WDfreq2[["rep"]] / sum(WDfreq2[["rep"]])) * 100
The result is a tidy table like:
|
1 2 |
head(WDfreq2) |
|
1 2 3 4 5 6 7 |
## pk rep perc ## 1 abind 2 0.04106776 ## 2 agricolae 4 0.08213552 ## 3 Amelia 4 0.08213552 ## 4 animation 5 0.10266940 ## 5 AppliedPredictiveModeling 8 0.16427105 ## 6 bartMachine 1 0.02053388 |
Step 4: plotting results
Now we get to the fun part! Our tidy table is ready for some plots. Below you will find the code and the results using ggplot2. I will plot my TOP 50 most used packages:
|
1 2 3 4 5 6 7 8 9 |
# Plotting (TOP 50): horizontal bar chart of the 50 most used packages.
library(ggplot2)

# Sort ascending by percentage so that, after coord_flip(), the most used
# package ends up at the top of the chart.
dfplot <- WDfreq2[order(WDfreq2$perc), ]

# tail() keeps exactly the last 50 rows. Indexing with
# (NROW(dfplot) - 50):NROW(dfplot) would select 51 rows, and would fail
# outright when there are 50 packages or fewer.
dfplot <- tail(dfplot, 50)

PLOT2 <- ggplot(dfplot, aes(pk, perc, fill = pk)) +
  geom_bar(stat = "identity", alpha = 0.5) +
  theme_minimal(base_size = 10) +
  theme(legend.position = "none") +
  coord_flip() +
  labs(
    title = "Percentage of library calls in my R & Rmd code",
    x = "package",
    y = "percentage of calls (%)",
    subtitle = "TOP 50"
  ) +
  # Explicit limits fix the bar order to the sorted order of dfplot.
  scale_x_discrete(limits = as.character(dfplot$pk))
PLOT2

Finally, we can get really beautiful results by creating a circle plot. I adapted the code from basic-circle-packing-with-one-level and hide-first-level-in-circle-packing. You may need to install the package packcircles. The result is shown below.
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 |
# Circle-packing plot: one bubble per package, sized by its number of calls.
library(packcircles)
library(ggplot2)

# Bubble layout: one row per package holding the bubble centre (x, y) and a
# radius derived from `rep`, which is interpreted as the bubble area.
packing <- circleProgressiveLayout(WDfreq2$rep, sizetype = "area")
data <- cbind(WDfreq2, packing)

# Turn each (centre, radius) pair into the vertices of a 50-sided polygon so
# the bubbles can be drawn with geom_polygon().
dat.gg <- circleLayoutVertices(packing, npoints = 50)

# Bubbles first, package labels on top; label size follows the call count.
PLOT3 <- ggplot() +
  geom_polygon(
    data = dat.gg,
    mapping = aes(x, y, group = id, fill = as.factor(id)),
    colour = "black",
    alpha = 0.5
  ) +
  geom_text(data = data, mapping = aes(x, y, size = rep, label = pk)) +
  scale_size_continuous(range = c(1, 4)) +
  theme_void() +
  theme(legend.position = "none") +
  coord_equal()
PLOT3

Export plot
If you want to output the plot you can do:
|
1 2 3 4 |
# Exporting PLOT3 to a PNG file one level above the working directory.
png(filename = "../circle_package_plot.png", width = 2000, height = 2000,
    pointsize = 12, res = 300)
# print() is required for the plot to be drawn when this code is run through
# source() or inside a function, where top-level auto-printing does not happen.
print(PLOT3)
# The file is only completely written once the graphics device is closed.
dev.off()
And that’s all, I hope you enjoyed it!!
Session Info:
|
1 2 3 |
------------------------------------ Total R execution time: 37.3 secs ------------------------------------ |
|
1 2 3 4 5 6 7 8 9 |
setting value version R version 3.4.3 (2017-11-30) os macOS High Sierra 10.13.5 system x86_64, darwin15.6.0 ui RStudio language (EN) collate es_ES.UTF-8 tz Europe/Madrid date 2018-08-16 |
|
1 2 |
------------------------------------ Packages: |
|
1 2 3 4 5 |
[1] "ggplot2 - 2.2.1 - 2016-12-30 - CRAN (R 3.4.0)" [2] "packcircles - 0.3.1 - 2018-01-09 - CRAN (R 3.4.3)" [3] "plyr - 1.8.4 - 2016-06-08 - CRAN (R 3.4.0)" [4] "RWordPress - 0.2-3 - 2018-03-04 - Github (duncantl/RWordPress@ce6d2d6)" [5] "stringr - 1.2.0 - 2017-02-18 - CRAN (R 3.4.0)" |
Appendix, all the code:
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
# Package frequency analysis: which packages are loaded most often with
# library() / require() across a collection of .R and .Rmd files.

library(plyr)
library(ggplot2)
library(packcircles)

setwd("~/choose/your/working/directory/")

# ---- Step 1: list the files -------------------------------------------------
# Folder that holds the scripts to analyse (searched recursively). Use "./" to
# scan the whole working directory instead.
code_dir <- "./R_code-history/"

# list.files() takes a regular expression, not a glob: escape the dot and
# anchor the extension. Directories are not included, since every entry of
# FILES is read as a text file below.
files_R1 <- list.files(code_dir, pattern = "\\.R$", all.files = TRUE,
                       full.names = TRUE, recursive = TRUE)
files_R2 <- list.files(code_dir, pattern = "\\.Rmd$", all.files = TRUE,
                       full.names = TRUE, recursive = TRUE)
FILES <- c(files_R1, files_R2)

# ---- Step 2: read the scripts and extract the package names -----------------
# Capture group 1 is the package name; it may be quoted and may be passed as
# `package = ...`. Only the name is captured, so further arguments such as
# `warn.conflicts = FALSE` never leak into the result and no clean-up of the
# package column is needed afterwards.
pkg_call <- paste0(
  "\\b(?:library|require)\\(\\s*(?:package\\s*=\\s*)?",
  "[\"']?([A-Za-z][A-Za-z0-9.]*)"
)

WDfreq <- vector("list", length(FILES))
for (i in seq_along(FILES)) {
  if (i %% 100 == 0) cat("\nfiles loaded:", i, "/", length(FILES))

  TEXT <- readLines(FILES[i], warn = FALSE, encoding = "UTF-8")

  # Every call on every line is collected; useBytes = TRUE keeps the matching
  # robust against files that are not valid UTF-8 (the pattern is pure ASCII).
  hits <- gregexpr(pkg_call, TEXT, perl = TRUE, useBytes = TRUE)
  calls <- unlist(regmatches(TEXT, hits))
  if (length(calls) == 0) next

  RESULT <- sub(pkg_call, "\\1", calls, perl = TRUE, useBytes = TRUE)

  # word frequencies plus file metadata (file.info() queried once per file)
  counts <- table(RESULT)
  info <- file.info(FILES[i])
  WDfreq[[i]] <- data.frame(
    pk = names(counts),
    rep = as.integer(counts),
    file = basename(FILES[i]),
    size = info$size,
    mtime = info$mtime,
    year = format(info$mtime, format = "%Y"),
    stringsAsFactors = FALSE
  )
}

WDfreq <- do.call("rbind", WDfreq)
cat("\nTotal packages:", NROW(WDfreq))

tail(WDfreq)

# ---- Step 3: aggregate by package -------------------------------------------
WDfreq2 <- ddply(WDfreq, .(pk), summarise, rep = sum(rep))
WDfreq2$perc <- (WDfreq2$rep / sum(WDfreq2$rep)) * 100

head(WDfreq2)

# ---- Step 4: plots ----------------------------------------------------------
# Plotting (TOP 50). tail() keeps exactly 50 rows and also works when there
# are fewer than 50 packages.
dfplot <- WDfreq2[order(WDfreq2$perc), ]
dfplot <- tail(dfplot, 50)

PLOT2 <- ggplot(dfplot, aes(pk, perc, fill = pk)) +
  geom_bar(stat = "identity", alpha = 0.5) +
  theme_minimal(base_size = 10) +
  theme(legend.position = "none") +
  coord_flip() +
  labs(
    title = "Percentage of library calls in my R & Rmd code",
    x = "package",
    y = "percentage of calls (%)",
    subtitle = "TOP 50"
  ) +
  scale_x_discrete(limits = as.character(dfplot$pk))
PLOT2

# Circle packing. The layout has one row per bubble with its centre (x, y)
# and a radius derived from `rep`, interpreted as the bubble area.
packing <- circleProgressiveLayout(WDfreq2$rep, sizetype = "area")
data <- cbind(WDfreq2, packing)

# Go from one centre + radius to the vertices of a 50-sided polygon.
dat.gg <- circleLayoutVertices(packing, npoints = 50)

PLOT3 <- ggplot() +
  geom_polygon(
    data = dat.gg,
    aes(x, y, group = id, fill = as.factor(id)),
    colour = "black",
    alpha = 0.5
  ) +
  geom_text(data = data, aes(x, y, size = rep, label = pk)) +
  scale_size_continuous(range = c(1, 4)) +
  theme_void() +
  theme(legend.position = "none") +
  coord_equal()
PLOT3

# ---- Export -----------------------------------------------------------------
# print() draws the plot even when the script is source()d; dev.off() closes
# the device so that the PNG file is actually completed.
png(filename = "../circle_package_plot.png", width = 2000, height = 2000,
    pointsize = 12, res = 300)
print(PLOT3)
dev.off()
