In this first post, I will show you a simple script that performs a frequency analysis of my most used R packages. With this analysis you can find out which R packages are used/called most often across a collection of R files (.R and .Rmd files). I will use my own R files as an example.

At the end of the post, I will show you the results that I got using the R files that I have developed from 2012 until the middle of 2018, and I will also show you the R code that I used in case you want to perform the same analysis using your own R files.

We will cover these easy steps:

1) How to read the content of all the R script files, and how to look for the words: library() or require()
2) How to extract each package xxxxx that has been called with library(xxxxx) or require(xxxxx) and store it in a table
3) Then we will aggregate the packages
4) And finally, we will plot the results

Step 1: a listing of all files

First of all, you need to gather all the R and Rmd files that you want to use for the package frequency analysis. Copy all the files into a folder; it can contain subfolders if you want. In this example (for OS X) the folder will be: ~/choose/your/working/directory/.

Then open RStudio and set your working directory as:

setwd("~/choose/your/working/directory/")

1 2	setwd("~/choose/your/working/directory/")

The next piece of code lists all the files with extension “.R” or “.Rmd” inside ~/choose/your/working/directory/. The paths to all these files will be stored in the vector FILES.

# List every R script and R Markdown file below the working directory.
# `pattern` is a regular expression (not a shell glob), so the dot has to be
# escaped: with the unescaped "*.R$" the dot matches any character, and names
# that merely end in "R" (e.g. DESCRIPTOR) could be picked up as well.
# Directories are left out (include.dirs = FALSE) because they cannot be read
# as scripts later on.
files_R1 <- list.files("./", pattern = "\\.R$", all.files = TRUE,
                       full.names = TRUE, recursive = TRUE,
                       include.dirs = FALSE)
files_R2 <- list.files("./", pattern = "\\.Rmd$", all.files = TRUE,
                       full.names = TRUE, recursive = TRUE,
                       include.dirs = FALSE)
# Paths of all the files to analyse.
FILES <- c(files_R1, files_R2)

# List every R script and R Markdown file below the working directory.
# `pattern` is a regular expression (not a shell glob), so the dot has to be
# escaped: with the unescaped "*.R$" the dot matches any character, and names
# that merely end in "R" (e.g. DESCRIPTOR) could be picked up as well.
# Directories are left out (include.dirs = FALSE) because they cannot be read
# as scripts later on.
files_R1 <- list.files("./", pattern = "\\.R$", all.files = TRUE,
                       full.names = TRUE, recursive = TRUE,
                       include.dirs = FALSE)
files_R2 <- list.files("./", pattern = "\\.Rmd$", all.files = TRUE,
                       full.names = TRUE, recursive = TRUE,
                       include.dirs = FALSE)
# Paths of all the files to analyse.
FILES <- c(files_R1, files_R2)

Step 2: reading the R scripts

The next piece of code reads (using a for loop) the contents of each script file and extracts the names of the packages that have been called using library(xxx) or require(xxx). The results are stored, one data frame per file, in the list WDfreq, which is combined into a single data frame right after the loop.

# Loaded once, outside the loop (it used to be re-run for every file).
library(stringr)

# Matches library(pkg) / require(pkg), with or without quotes around the
# name. The capture group holds the bare package name only, so extra call
# arguments (e.g. warn.conflicts = FALSE) never end up in the result.
pkg_call <- "(?:library|require)\\(\\s*[\"']?([A-Za-z][A-Za-z0-9.]*)"

# One slot per file; files without package calls keep an empty (NULL) slot.
WDfreq <- vector("list", length(FILES))
for (i in seq_along(FILES)) {  # seq_along() is safe when FILES is empty
  if (i %% 100 == 0) cat("\nfiles loaded:", i, "/", length(FILES))

  # Read the script, one element per non-empty line.
  TEXT <- scan(FILES[i], "character", sep = "\n", quiet = TRUE,
               encoding = "UTF-8")

  # Every package named in a library()/require() call of this file.
  calls <- str_match_all(TEXT, pkg_call)
  RESULT <- unlist(lapply(calls, function(m) m[, 2]))
  if (length(RESULT) == 0) next

  # Number of calls per package in this file, plus file metadata.
  counts <- table(RESULT)
  info <- file.info(FILES[i])  # queried once instead of twice
  WDfreq[[i]] <- data.frame(
    pk = names(counts),
    rep = as.integer(counts),
    file = basename(FILES[i]),
    size = info$size,
    mtime = info$mtime,
    year = format(info$mtime, format = "%Y"),
    stringsAsFactors = FALSE
  )
}

# Loaded once, outside the loop (it used to be re-run for every file).
library(stringr)

# Matches library(pkg) / require(pkg), with or without quotes around the
# name. The capture group holds the bare package name only, so extra call
# arguments (e.g. warn.conflicts = FALSE) never end up in the result.
pkg_call <- "(?:library|require)\\(\\s*[\"']?([A-Za-z][A-Za-z0-9.]*)"

# One slot per file; files without package calls keep an empty (NULL) slot.
WDfreq <- vector("list", length(FILES))
for (i in seq_along(FILES)) {  # seq_along() is safe when FILES is empty
  if (i %% 100 == 0) cat("\nfiles loaded:", i, "/", length(FILES))

  # Read the script, one element per non-empty line.
  TEXT <- scan(FILES[i], "character", sep = "\n", quiet = TRUE,
               encoding = "UTF-8")

  # Every package named in a library()/require() call of this file.
  calls <- str_match_all(TEXT, pkg_call)
  RESULT <- unlist(lapply(calls, function(m) m[, 2]))
  if (length(RESULT) == 0) next

  # Number of calls per package in this file, plus file metadata.
  counts <- table(RESULT)
  info <- file.info(FILES[i])  # queried once instead of twice
  WDfreq[[i]] <- data.frame(
    pk = names(counts),
    rep = as.integer(counts),
    file = basename(FILES[i]),
    size = info$size,
    mtime = info$mtime,
    year = format(info$mtime, format = "%Y"),
    stringsAsFactors = FALSE
  )
}

## 
## files loaded: 100 / 591
## files loaded: 200 / 591
## files loaded: 300 / 591
## files loaded: 400 / 591
## files loaded: 500 / 591

## files loaded: 100 / 591

## files loaded: 200 / 591

## files loaded: 300 / 591

## files loaded: 400 / 591

## files loaded: 500 / 591

# Stack the per-file tables into one data frame and report how many
# file/package rows were collected.
WDfreq <- do.call(rbind, WDfreq)
cat("\nTotal packages:", NROW(WDfreq))

# Stack the per-file tables into one data frame and report how many
# file/package rows were collected.
WDfreq <- do.call(rbind, WDfreq)
cat("\nTotal packages:", NROW(WDfreq))

## 
## Total packages: 2907

1 2	## ## Total packages: 2907

You will get a data frame like the one shown below, with these columns:

pk: R package
rep: number of package calls in each file
file: R or Rmd file
size: size of the file in bytes
mtime: last file modification date-time
year: last file modification year

tail(WDfreq)

1 2	tail(WDfreq)

##                pk rep             file  size               mtime year
## 2902       raster   1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018
## 2903 RColorBrewer   1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018
## 2904      reshape   1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018
## 2905        rgdal   1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018
## 2906  RgoogleMaps   1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018
## 2907           sp   1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018

## pk rep file size mtime year

## 2902 raster 1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018

## 2903 RColorBrewer 1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018

## 2904 reshape 1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018

## 2905 rgdal 1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018

## 2906 RgoogleMaps 1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018

## 2907 sp 1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018

Step 3: aggregating package names

Next, I will aggregate the rows that share the same package name (adding up the rep column) in order to get statistics on the packages I used most frequently. This can be done with the ddply function.

library(plyr)

# Add up the per-file call counts so that every package gets a single row.
WDfreq2 <- ddply(WDfreq, "pk", summarise, rep = sum(rep))

# Share of each package in the total number of calls, in percent.
WDfreq2$perc <- with(WDfreq2, (rep / sum(rep)) * 100)

library(plyr)

# Add up the per-file call counts so that every package gets a single row.
WDfreq2 <- ddply(WDfreq, "pk", summarise, rep = sum(rep))

# Share of each package in the total number of calls, in percent.
WDfreq2$perc <- with(WDfreq2, (rep / sum(rep)) * 100)

The result is a tidy table like:

head(WDfreq2)

1 2	head(WDfreq2)

##                          pk rep       perc
## 1                     abind   2 0.04106776
## 2                 agricolae   4 0.08213552
## 3                    Amelia   4 0.08213552
## 4                 animation   5 0.10266940
## 5 AppliedPredictiveModeling   8 0.16427105
## 6               bartMachine   1 0.02053388

## pk rep perc

## 1 abind 2 0.04106776

## 2 agricolae 4 0.08213552

## 3 Amelia 4 0.08213552

## 4 animation 5 0.10266940

## 5 AppliedPredictiveModeling 8 0.16427105

## 6 bartMachine 1 0.02053388

Step 4: plotting results

Now we are in the fun part! We have our tidy table ready for some plots. Below you will find the code and the results using ggplot2. I will plot my TOP 50 most used packages:

# Plotting (TOP 50)
library(ggplot2)

# Sort ascending by share and keep the last 50 rows. tail() returns exactly
# 50 packages (the previous index range returned 51) and also copes with
# tables that hold fewer than 50 packages.
dfplot <- WDfreq2[order(WDfreq2$perc), ]
dfplot <- tail(dfplot, 50)

# Horizontal bar chart; the x limits pin the bars to the sorted order.
PLOT2 <- ggplot(dfplot, aes(pk, perc, fill = pk)) +
  geom_bar(stat = "identity", alpha = 0.5) +
  theme_minimal(base_size = 10) +
  theme(legend.position = "none") +
  coord_flip() +
  labs(title = "Percentage of library calls in my R & Rmd code",
       x = "package", y = "percentage of calls (%)", subtitle = "TOP 50") +
  scale_x_discrete(limits = as.character(dfplot$pk))
PLOT2

# Plotting (TOP 50)
library(ggplot2)

# Sort ascending by share and keep the last 50 rows. tail() returns exactly
# 50 packages (the previous index range returned 51) and also copes with
# tables that hold fewer than 50 packages.
dfplot <- WDfreq2[order(WDfreq2$perc), ]
dfplot <- tail(dfplot, 50)

# Horizontal bar chart; the x limits pin the bars to the sorted order.
PLOT2 <- ggplot(dfplot, aes(pk, perc, fill = pk)) +
  geom_bar(stat = "identity", alpha = 0.5) +
  theme_minimal(base_size = 10) +
  theme(legend.position = "none") +
  coord_flip() +
  labs(title = "Percentage of library calls in my R & Rmd code",
       x = "package", y = "percentage of calls (%)", subtitle = "TOP 50") +
  scale_x_discrete(limits = as.character(dfplot$pk))
PLOT2

Finally, we can get really beautiful results by creating a circle plot. I adapted the code from basic-circle-packing-with-one-level and hide-first-level-in-circle-packing. You may need to install the package packcircles. The result is shown below.

# Packages needed for the circle-packing figure
library(packcircles)
library(ggplot2)

# Lay the bubbles out: one row per package with the centre (x, y) and the
# radius of its circle, sized from the number of calls (sizetype = "area").
packing <- circleProgressiveLayout(WDfreq2$rep, sizetype = "area")

# Attach the layout to the package table so labels can sit on the centres.
data <- cbind(WDfreq2, packing)

# Turn every centre + radius into a 50-point polygon outlining the circle.
dat.gg <- circleLayoutVertices(packing, npoints = 50)

# Draw the circles, label them with the package names, and drop axes and
# legend.
PLOT3 <- ggplot() +
  geom_polygon(data = dat.gg,
               aes(x, y, group = id, fill = as.factor(id)),
               colour = "black", alpha = 0.5) +
  geom_text(data = data, aes(x, y, size = rep, label = pk)) +
  scale_size_continuous(range = c(1, 4)) +
  theme_void() +
  theme(legend.position = "none") +
  coord_equal()
PLOT3

# Packages needed for the circle-packing figure
library(packcircles)
library(ggplot2)

# Lay the bubbles out: one row per package with the centre (x, y) and the
# radius of its circle, sized from the number of calls (sizetype = "area").
packing <- circleProgressiveLayout(WDfreq2$rep, sizetype = "area")

# Attach the layout to the package table so labels can sit on the centres.
data <- cbind(WDfreq2, packing)

# Turn every centre + radius into a 50-point polygon outlining the circle.
dat.gg <- circleLayoutVertices(packing, npoints = 50)

# Draw the circles, label them with the package names, and drop axes and
# legend.
PLOT3 <- ggplot() +
  geom_polygon(data = dat.gg,
               aes(x, y, group = id, fill = as.factor(id)),
               colour = "black", alpha = 0.5) +
  geom_text(data = data, aes(x, y, size = rep, label = pk)) +
  scale_size_continuous(range = c(1, 4)) +
  theme_void() +
  theme(legend.position = "none") +
  coord_equal()
PLOT3

Export plot

If you want to output the plot you can do:

# Exporting plot3: open the PNG device, draw the plot, then close the device.
# Without dev.off() the file is never finalised, and the explicit print() is
# needed whenever the code runs via source() or inside a function.
png(filename = "../circle_package_plot.png", width = 2000, height = 2000,
    pointsize = 12, res = 300)
print(PLOT3)
dev.off()

# Exporting plot3: open the PNG device, draw the plot, then close the device.
# Without dev.off() the file is never finalised, and the explicit print() is
# needed whenever the code runs via source() or inside a function.
png(filename = "../circle_package_plot.png", width = 2000, height = 2000,
    pointsize = 12, res = 300)
print(PLOT3)
dev.off()

And that’s all, I hope you enjoyed it!!

Session Info:

------------------------------------
Total R execution time:  37.3 secs 
------------------------------------

------------------------------------

Total R execution time: 37.3 secs

------------------------------------

 setting  value                       
 version  R version 3.4.3 (2017-11-30)
 os       macOS High Sierra 10.13.5   
 system   x86_64, darwin15.6.0        
 ui       RStudio                     
 language (EN)                        
 collate  es_ES.UTF-8                 
 tz       Europe/Madrid               
 date     2018-08-16

setting value

version R version 3.4.3 (2017-11-30)

os macOS High Sierra 10.13.5

system x86_64, darwin15.6.0

ui RStudio

language (EN)

collate es_ES.UTF-8

tz Europe/Madrid

date 2018-08-16

------------------------------------
Packages:

1 2	------------------------------------ Packages:

[1] "ggplot2 - 2.2.1 - 2016-12-30 - CRAN (R 3.4.0)"                         
[2] "packcircles - 0.3.1 - 2018-01-09 - CRAN (R 3.4.3)"                     
[3] "plyr - 1.8.4 - 2016-06-08 - CRAN (R 3.4.0)"                            
[4] "RWordPress - 0.2-3 - 2018-03-04 - Github (duncantl/RWordPress@ce6d2d6)"
[5] "stringr - 1.2.0 - 2017-02-18 - CRAN (R 3.4.0)"

[1] "ggplot2 - 2.2.1 - 2016-12-30 - CRAN (R 3.4.0)"

[2] "packcircles - 0.3.1 - 2018-01-09 - CRAN (R 3.4.3)"

[3] "plyr - 1.8.4 - 2016-06-08 - CRAN (R 3.4.0)"

[4] "RWordPress - 0.2-3 - 2018-03-04 - Github (duncantl/RWordPress@ce6d2d6)"

[5] "stringr - 1.2.0 - 2017-02-18 - CRAN (R 3.4.0)"

Appendix, all the code:

setwd("~/choose/your/working/directory/")

# Packages used by the script, loaded once up front.
library(stringr)
library(plyr)
library(ggplot2)
library(packcircles)

# ---- Step 1: list the files ------------------------------------------------
# `pattern` is a regular expression, so the dot is escaped; directories are
# excluded because they cannot be read as scripts.
files_R1 <- list.files("./", pattern = "\\.R$", all.files = TRUE,
                       full.names = TRUE, recursive = TRUE,
                       include.dirs = FALSE)
files_R2 <- list.files("./", pattern = "\\.Rmd$", all.files = TRUE,
                       full.names = TRUE, recursive = TRUE,
                       include.dirs = FALSE)
FILES <- c(files_R1, files_R2)
# NOTE(review): this second listing overrides FILES with the contents of
# ./R_code-history/; drop it to analyse the whole working directory.
files_R1 <- list.files("./R_code-history/", pattern = "\\.R$",
                       all.files = TRUE, full.names = TRUE, recursive = TRUE,
                       include.dirs = FALSE)
files_R2 <- list.files("./R_code-history/", pattern = "\\.Rmd$",
                       all.files = TRUE, full.names = TRUE, recursive = TRUE,
                       include.dirs = FALSE)
FILES <- c(files_R1, files_R2)

# ---- Step 2: read the scripts and extract the package names ----------------
# Matches library(pkg) / require(pkg), with or without quotes around the
# name. The capture group holds the bare package name only, so extra call
# arguments (e.g. warn.conflicts = FALSE) never end up in the result.
pkg_call <- "(?:library|require)\\(\\s*[\"']?([A-Za-z][A-Za-z0-9.]*)"

# One slot per file; files without package calls keep an empty (NULL) slot.
WDfreq <- vector("list", length(FILES))
for (i in seq_along(FILES)) {  # seq_along() is safe when FILES is empty
  if (i %% 100 == 0) cat("\nfiles loaded:", i, "/", length(FILES))

  # Read the script, one element per non-empty line.
  TEXT <- scan(FILES[i], "character", sep = "\n", quiet = TRUE,
               encoding = "UTF-8")

  # Every package named in a library()/require() call of this file.
  calls <- str_match_all(TEXT, pkg_call)
  RESULT <- unlist(lapply(calls, function(m) m[, 2]))
  if (length(RESULT) == 0) next

  # Number of calls per package in this file, plus file metadata.
  counts <- table(RESULT)
  info <- file.info(FILES[i])  # queried once instead of twice
  WDfreq[[i]] <- data.frame(
    pk = names(counts),
    rep = as.integer(counts),
    file = basename(FILES[i]),
    size = info$size,
    mtime = info$mtime,
    year = format(info$mtime, format = "%Y"),
    stringsAsFactors = FALSE
  )
}
WDfreq <- do.call("rbind", WDfreq)
cat("\nTotal packages:", NROW(WDfreq))
# Legacy clean-up kept as a safety net; the regex extraction above already
# drops call arguments, so this is expected to change nothing.
WDfreq$pk <- gsub(",, warn.conflicts=FALSE", "", WDfreq$pk, fixed = TRUE)
tail(WDfreq)

# ---- Step 3: aggregate by package ------------------------------------------
WDfreq2 <- ddply(WDfreq, .(pk), summarise, rep = sum(rep))
WDfreq2$perc <- (WDfreq2$rep / sum(WDfreq2$rep)) * 100
head(WDfreq2)

# ---- Step 4: plots ---------------------------------------------------------
# Plotting (TOP 50): tail() returns exactly 50 rows (the previous index range
# returned 51) and also copes with tables that hold fewer than 50 packages.
dfplot <- WDfreq2[order(WDfreq2$perc), ]
dfplot <- tail(dfplot, 50)
PLOT2 <- ggplot(dfplot, aes(pk, perc, fill = pk)) +
  geom_bar(stat = "identity", alpha = 0.5) +
  theme_minimal(base_size = 10) +
  theme(legend.position = "none") +
  coord_flip() +
  labs(title = "Percentage of library calls in my R & Rmd code",
       x = "package", y = "percentage of calls (%)", subtitle = "TOP 50") +
  scale_x_discrete(limits = as.character(dfplot$pk))
PLOT2

# Generate the layout. This function returns a data frame with one line per
# bubble. It gives its center (x and y) and its radius, proportional to the
# value.
packing <- circleProgressiveLayout(WDfreq2$rep, sizetype = "area")
data <- cbind(WDfreq2, packing)

# The next step is to go from one center + a radius to the coordinates of a
# circle that is drawn by a multitude of straight lines.
dat.gg <- circleLayoutVertices(packing, npoints = 50)

# Make the plot
PLOT3 <- ggplot() +
  geom_polygon(data = dat.gg,
               aes(x, y, group = id, fill = as.factor(id)),
               colour = "black", alpha = 0.5) +
  geom_text(data = data, aes(x, y, size = rep, label = pk)) +
  scale_size_continuous(range = c(1, 4)) +
  theme_void() +
  theme(legend.position = "none") +
  coord_equal()
PLOT3

# Exporting plot3: draw into the PNG device, then close it so the file is
# actually written.
png(filename = "../circle_package_plot.png", width = 2000, height = 2000,
    pointsize = 12, res = 300)
print(PLOT3)
dev.off()

100

101

102

103

104

105

106

107

108

109

110

setwd("~/choose/your/working/directory/")

# Packages used by the script, loaded once up front.
library(stringr)
library(plyr)
library(ggplot2)
library(packcircles)

# ---- Step 1: list the files ------------------------------------------------
# `pattern` is a regular expression, so the dot is escaped; directories are
# excluded because they cannot be read as scripts.
files_R1 <- list.files("./", pattern = "\\.R$", all.files = TRUE,
                       full.names = TRUE, recursive = TRUE,
                       include.dirs = FALSE)
files_R2 <- list.files("./", pattern = "\\.Rmd$", all.files = TRUE,
                       full.names = TRUE, recursive = TRUE,
                       include.dirs = FALSE)
FILES <- c(files_R1, files_R2)
# NOTE(review): this second listing overrides FILES with the contents of
# ./R_code-history/; drop it to analyse the whole working directory.
files_R1 <- list.files("./R_code-history/", pattern = "\\.R$",
                       all.files = TRUE, full.names = TRUE, recursive = TRUE,
                       include.dirs = FALSE)
files_R2 <- list.files("./R_code-history/", pattern = "\\.Rmd$",
                       all.files = TRUE, full.names = TRUE, recursive = TRUE,
                       include.dirs = FALSE)
FILES <- c(files_R1, files_R2)

# ---- Step 2: read the scripts and extract the package names ----------------
# Matches library(pkg) / require(pkg), with or without quotes around the
# name. The capture group holds the bare package name only, so extra call
# arguments (e.g. warn.conflicts = FALSE) never end up in the result.
pkg_call <- "(?:library|require)\\(\\s*[\"']?([A-Za-z][A-Za-z0-9.]*)"

# One slot per file; files without package calls keep an empty (NULL) slot.
WDfreq <- vector("list", length(FILES))
for (i in seq_along(FILES)) {  # seq_along() is safe when FILES is empty
  if (i %% 100 == 0) cat("\nfiles loaded:", i, "/", length(FILES))

  # Read the script, one element per non-empty line.
  TEXT <- scan(FILES[i], "character", sep = "\n", quiet = TRUE,
               encoding = "UTF-8")

  # Every package named in a library()/require() call of this file.
  calls <- str_match_all(TEXT, pkg_call)
  RESULT <- unlist(lapply(calls, function(m) m[, 2]))
  # Skip files without any package call (this copy of the script had lost
  # the `next`, which turned the check into a guard on the table() line).
  if (length(RESULT) == 0) next

  # Number of calls per package in this file, plus file metadata.
  counts <- table(RESULT)
  info <- file.info(FILES[i])  # queried once instead of twice
  WDfreq[[i]] <- data.frame(
    pk = names(counts),
    rep = as.integer(counts),
    file = basename(FILES[i]),
    size = info$size,
    mtime = info$mtime,
    year = format(info$mtime, format = "%Y"),
    stringsAsFactors = FALSE
  )
}
WDfreq <- do.call("rbind", WDfreq)
cat("\nTotal packages:", NROW(WDfreq))
# Legacy clean-up kept as a safety net; the regex extraction above already
# drops call arguments, so this is expected to change nothing.
WDfreq$pk <- gsub(",, warn.conflicts=FALSE", "", WDfreq$pk, fixed = TRUE)
tail(WDfreq)

# ---- Step 3: aggregate by package ------------------------------------------
WDfreq2 <- ddply(WDfreq, .(pk), summarise, rep = sum(rep))
WDfreq2$perc <- (WDfreq2$rep / sum(WDfreq2$rep)) * 100
head(WDfreq2)

# ---- Step 4: plots ---------------------------------------------------------
# Plotting (TOP 50): tail() returns exactly 50 rows (the previous index range
# returned 51) and also copes with tables that hold fewer than 50 packages.
dfplot <- WDfreq2[order(WDfreq2$perc), ]
dfplot <- tail(dfplot, 50)
PLOT2 <- ggplot(dfplot, aes(pk, perc, fill = pk)) +
  geom_bar(stat = "identity", alpha = 0.5) +
  theme_minimal(base_size = 10) +
  theme(legend.position = "none") +
  coord_flip() +
  labs(title = "Percentage of library calls in my R & Rmd code",
       x = "package", y = "percentage of calls (%)", subtitle = "TOP 50") +
  scale_x_discrete(limits = as.character(dfplot$pk))
PLOT2

# Generate the layout. This function returns a data frame with one line per
# bubble. It gives its center (x and y) and its radius, proportional to the
# value.
packing <- circleProgressiveLayout(WDfreq2$rep, sizetype = "area")
data <- cbind(WDfreq2, packing)

# The next step is to go from one center + a radius to the coordinates of a
# circle that is drawn by a multitude of straight lines.
dat.gg <- circleLayoutVertices(packing, npoints = 50)

# Make the plot
PLOT3 <- ggplot() +
  geom_polygon(data = dat.gg,
               aes(x, y, group = id, fill = as.factor(id)),
               colour = "black", alpha = 0.5) +
  geom_text(data = data, aes(x, y, size = rep, label = pk)) +
  scale_size_continuous(range = c(1, 4)) +
  theme_void() +
  theme(legend.position = "none") +
  coord_equal()
PLOT3

# Exporting plot3: draw into the PNG device, then close it so the file is
# actually written.
png(filename = "../circle_package_plot.png", width = 2000, height = 2000,
    pointsize = 12, res = 300)
print(PLOT3)
dev.off()

Share it!:

a blog about data science

Plotting proportionally sized circles about the frequency of usage of R packages

Step 1: a listing of all files

Step 2: reading the R scripts

Step 3: aggregating package names

Step 4: plotting results

Export plot

Leave a Reply Cancel reply