En este primer post te mostraré un sencillo script que analiza con qué frecuencia he usado cada paquete de R. Con este análisis sabrás cuáles son los paquetes de R que más has cargado (con library() o require()) en todo un conjunto de archivos de R (archivos .R y .Rmd). Usaré mi historial de archivos y scripts de R como ejemplo.

Al final del post podrás ver los resultados que he obtenido usando los archivos R que he desarrollado desde 2012 hasta mediados de 2018. También te mostraré el código R que usé, por si quieres realizar el mismo análisis con tus propios archivos.

Cubriremos estos sencillos pasos:

1) Cómo leer el contenido de todos los archivos de script R, y cómo buscar las palabras: library() o require()
2) Cómo extraer cada paquete xxxxx que ha sido llamado con library(xxxxx) o require(xxxxx) y guardarlo en una tabla
3) Luego agregaremos los paquetes
4) Y finalmente, pintaremos los resultados

Paso 1: listando todos los archivos

En primer lugar, hay que reunir todos los archivos R y Rmd que quieras utilizar para el análisis de frecuencia de uso de paquetes. Para ello, copia todos los archivos en una carpeta; puede contener subcarpetas si lo deseas. En este ejemplo (para OS X) esta carpeta será: ~/choose/your/working/directory/.

Luego abre RStudio y configura tu directorio de trabajo así:

# Make the folder that holds the scripts the working directory; the relative
# paths used in the following steps are resolved against it.
setwd("~/choose/your/working/directory/")

Entonces el siguiente código listará todos los archivos que tengan la extensión “.R” o “.Rmd” en ~/choose/your/working/directory/ (incluidas sus subcarpetas). La ruta completa de estos archivos se almacenará en el vector FILES.

# List the files under `path` whose names match `pattern`.
#
# `pattern` is a regular expression (not a shell glob). Sub-folders and hidden
# files are searched; directories themselves are never returned, so every
# element of the result is a file path.
list_r_files <- function(path, pattern) {
    list.files(path, pattern = pattern, all.files = TRUE, full.names = TRUE,
               recursive = TRUE, include.dirs = FALSE)
}

# "\\.R$" and "\\.Rmd$": a literal dot followed by the extension at the very
# end of the file name.
files_R1 <- list_r_files("./", "\\.R$")
files_R2 <- list_r_files("./", "\\.Rmd$")
FILES <- c(files_R1, files_R2)

Paso 2: leyendo los archivos de R

El siguiente código leerá (usando un bucle for) todo el contenido de cada archivo o script, y extraerá el nombre de los paquetes que han sido llamados usando library(xxx) o require(xxx). El resultado se almacenará en la lista WDfreq (una tabla por archivo), que en el siguiente bloque se combinará en un único data frame.

library(stringr)

# Regular expression for a package-loading call. It matches `library(` or
# `require(` as a whole word, optional spaces and an optional opening quote,
# and captures the package name that follows (a letter, then letters, digits
# or dots). Anything after the name, e.g. `, warn.conflicts = FALSE)`, is
# ignored.
PKG_CALL <- "\\b(?:library|require)\\(\\s*[\"']?([A-Za-z][A-Za-z0-9.]*)"

# One element per file; stays NULL for files without any library()/require()
# call (NULL elements are dropped when the list is later row-bound).
WDfreq <- vector("list", length(FILES))

# seq_along() yields an empty sequence when FILES is empty, so the loop is
# simply skipped instead of trying to read FILES[1] == NA.
for (i in seq_along(FILES)) {
    # progress message every 100 files
    if (i %% 100 == 0) cat("\nfiles loaded:", i, "/", NROW(FILES))

    # one string per line of the file
    TEXT <- readLines(FILES[i], warn = FALSE, encoding = "UTF-8")

    # Package names of every library()/require() call, including several
    # calls on the same line. Column 2 of each match matrix is the capture.
    # NOTE: calls inside comments are counted too; the text is not parsed.
    RESULT <- unlist(lapply(str_match_all(TEXT, PKG_CALL), function(m) m[, 2]))
    if (length(RESULT) == 0) next

    # package frequencies within this file
    counts <- table(RESULT)

    # file metadata, looked up once per file
    info <- file.info(FILES[i])

    WDfreq[[i]] <- data.frame(pk = names(counts),
                              rep = as.integer(counts),
                              file = basename(FILES[i]),
                              size = info$size,
                              mtime = info$mtime,
                              stringsAsFactors = FALSE)
    WDfreq[[i]]$year <- format(WDfreq[[i]]$mtime, format = "%Y")
}

## 
## files loaded: 100 / 591
## files loaded: 200 / 591
## files loaded: 300 / 591
## files loaded: 400 / 591
## files loaded: 500 / 591

## files loaded: 100 / 591

## files loaded: 200 / 591

## files loaded: 300 / 591

## files loaded: 400 / 591

## files loaded: 500 / 591

# Stack the per-file tables into a single data frame. This must run exactly
# once: calling it again on the resulting data frame would row-bind its
# columns and produce a matrix instead.
WDfreq <- do.call("rbind", WDfreq)
cat("\nTotal packages:", NROW(WDfreq))

## 
## Total packages: 2907

## Total packages: 2907

Obtendrás un dataframe donde se muestran las siguientes columnas:

pk: Paquete R
rep: número de llamadas (calls) a los paquetes en cada archivo
file: nombre del archivo R o Rmd
size: tamaño del archivo en bytes
mtime: fecha y hora de la última modificación del archivo
year: año de la última modificación del archivo

# Show the last rows of the combined per-file table
tail(WDfreq)

##                pk rep             file  size               mtime year
## 2902       raster   1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018
## 2903 RColorBrewer   1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018
## 2904      reshape   1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018
## 2905        rgdal   1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018
## 2906  RgoogleMaps   1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018
## 2907           sp   1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018

## pk rep file size mtime year

## 2902 raster 1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018

## 2903 RColorBrewer 1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018

## 2904 reshape 1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018

## 2905 rgdal 1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018

## 2906 RgoogleMaps 1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018

## 2907 sp 1 Yoga_centers.Rmd 22984 2018-08-11 10:11:01 2018

Paso 3: agregando el nombre de los packages

A continuación, agregaremos los paquetes que tengan el mismo nombre (haré la suma en la columna rep) para tener un enfoque estadístico de los paquetes más frecuentes. Esto se puede hacer con la función ddply.

library(plyr)

# One row per package: `rep` is the total number of calls over all files
WDfreq2 <- ddply(WDfreq, .(pk), summarise, rep = sum(rep))

# Share of each package in the total number of calls, in percent
WDfreq2$perc <- (WDfreq2$rep / sum(WDfreq2$rep)) * 100

El resultado es una tidy table tal como:

# Show the first rows of the aggregated table
head(WDfreq2)

##                          pk rep       perc
## 1                     abind   2 0.04106776
## 2                 agricolae   4 0.08213552
## 3                    Amelia   4 0.08213552
## 4                 animation   5 0.10266940
## 5 AppliedPredictiveModeling   8 0.16427105
## 6               bartMachine   1 0.02053388

## pk rep perc

## 1 abind 2 0.04106776

## 2 agricolae 4 0.08213552

## 3 Amelia 4 0.08213552

## 4 animation 5 0.10266940

## 5 AppliedPredictiveModeling 8 0.16427105

## 6 bartMachine 1 0.02053388

Paso 4: pintando resultados

¡Ahora llega la parte más divertida! Así que tenemos nuestra tabla de datos lista para hacer algunas gráficas. Abajo encontrarás el código y los resultados usando ggplot2. Voy a dibujar mi TOP 50 de los paquetes más usados:

# plotting (TOP 50)
library(ggplot2)

# Sort by share of calls (ascending) and keep the last 50 rows, i.e. the 50
# most used packages. tail() returns exactly 50 rows, or all of them when
# fewer than 50 packages were found.
dfplot <- WDfreq2[order(WDfreq2$perc), ]
dfplot <- tail(dfplot, 50)

# Horizontal bar chart. The axis limits are set to the sorted package names
# (as plain strings) so the bars follow the ranking, not alphabetical order.
PLOT2 <- ggplot(dfplot, aes(pk, perc, fill = pk)) +
    geom_bar(stat = "identity", alpha = 0.5) +
    theme_minimal(base_size = 10) +
    theme(legend.position = "none") +
    coord_flip() +
    labs(title = "Percentage of library calls in my R & Rmd code",
         x = "package", y = "percentage of calls (%)", subtitle = "TOP 50") +
    scale_x_discrete(limits = as.character(dfplot$pk))
PLOT2

Finalmente, podemos obtener resultados realmente llamativos creando una gráfica de círculos, para ello he adaptado el código original que encontré en basic-circle-packing-with-one-level y hide-first-level-in-circle-packing. Puede que necesites instalar el paquete packcircles, el resultado se muestra a continuación.

# libraries
library(packcircles)
library(ggplot2)

# Generate the layout.
# This function returns a data frame with one line per bubble.
# It gives its center (x and y) and its radius, proportional to the value
packing <- circleProgressiveLayout(WDfreq2$rep, sizetype = "area")

# Attach the layout to the package table so each label can be placed at the
# center of its bubble.
data <- cbind(WDfreq2, packing)

# The next step is to go from one center + a radius to the coordinates of a
# circle that is drawn by a multitude of straight lines (50 points each).
dat.gg <- circleLayoutVertices(packing, npoints = 50)

# Make the plot: one polygon per bubble, package name on top with a text size
# that follows the number of calls.
PLOT3 <- ggplot() +
    geom_polygon(data = dat.gg, aes(x, y, group = id, fill = as.factor(id)),
                 colour = "black", alpha = 0.5) +
    geom_text(data = data, aes(x, y, size = rep, label = pk)) +
    scale_size_continuous(range = c(1, 4)) +
    theme_void() +
    theme(legend.position = "none") +
    coord_equal()
PLOT3

Exportando la gráfica

Si quieres exportar el gráfico ejecuta:

# exporting plot3
# Open the PNG device once, draw the plot into it and close the device;
# the image file is only complete after dev.off().
png(filename = "../circle_package_plot.png", width = 2000, height = 2000, pointsize = 12, res = 300)
# print() is explicit so the plot is also drawn when this code is run with
# source(), where a bare `PLOT3` would not be auto-printed.
print(PLOT3)
dev.off()

¡Y eso es todo! Espero que lo hayas disfrutado y que te haya sido útil.

Session Info:

------------------------------------
Total R execution time:  37.3 secs 
------------------------------------

------------------------------------

Total R execution time: 37.3 secs

------------------------------------

 setting  value                       
 version  R version 3.4.3 (2017-11-30)
 os       macOS High Sierra 10.13.5   
 system   x86_64, darwin15.6.0        
 ui       RStudio                     
 language (EN)                        
 collate  es_ES.UTF-8                 
 tz       Europe/Madrid               
 date     2018-08-16

setting value

version R version 3.4.3 (2017-11-30)

os macOS High Sierra 10.13.5

system x86_64, darwin15.6.0

ui RStudio

language (EN)

collate es_ES.UTF-8

tz Europe/Madrid

date 2018-08-16

------------------------------------
Packages:

1 2	------------------------------------ Packages:

[1] "ggplot2 - 2.2.1 - 2016-12-30 - CRAN (R 3.4.0)"                         
[2] "packcircles - 0.3.1 - 2018-01-09 - CRAN (R 3.4.3)"                     
[3] "plyr - 1.8.4 - 2016-06-08 - CRAN (R 3.4.0)"                            
[4] "RWordPress - 0.2-3 - 2018-03-04 - Github (duncantl/RWordPress@ce6d2d6)"
[5] "stringr - 1.2.0 - 2017-02-18 - CRAN (R 3.4.0)"

[1] "ggplot2 - 2.2.1 - 2016-12-30 - CRAN (R 3.4.0)"

[2] "packcircles - 0.3.1 - 2018-01-09 - CRAN (R 3.4.3)"

[3] "plyr - 1.8.4 - 2016-06-08 - CRAN (R 3.4.0)"

[4] "RWordPress - 0.2-3 - 2018-03-04 - Github (duncantl/RWordPress@ce6d2d6)"

[5] "stringr - 1.2.0 - 2017-02-18 - CRAN (R 3.4.0)"

Appendix, all the code:

# Complete script: frequency of library()/require() calls in a collection of
# .R and .Rmd files, with a TOP 50 bar chart and a circle-packing chart.

setwd("~/choose/your/working/directory/")

library(stringr)
library(plyr)
library(ggplot2)
library(packcircles)

# ---- Step 1: list the files ----

# Folder (relative to the working directory) that is scanned recursively.
# Use "./" to scan the working directory itself.
CODE_DIR <- "./R_code-history/"

# The patterns are regular expressions: a literal dot followed by the
# extension at the end of the name. Directories are not returned.
files_R1 <- list.files(CODE_DIR, pattern = "\\.R$", all.files = TRUE,
    full.names = TRUE, recursive = TRUE)
files_R2 <- list.files(CODE_DIR, pattern = "\\.Rmd$", all.files = TRUE,
    full.names = TRUE, recursive = TRUE)
FILES <- c(files_R1, files_R2)

# ---- Step 2: read the files ----

# Matches `library(` or `require(` as a whole word, optional spaces and an
# optional opening quote, and captures the package name that follows.
# Further arguments such as `, warn.conflicts = FALSE` are ignored, so no
# file-specific clean-up of the extracted names is needed afterwards.
PKG_CALL <- "\\b(?:library|require)\\(\\s*[\"']?([A-Za-z][A-Za-z0-9.]*)"

# One element per file; stays NULL for files without any call.
WDfreq <- vector("list", length(FILES))
for (i in seq_along(FILES)) {
    # progress message every 100 files
    if (i %% 100 == 0) cat("\nfiles loaded:", i, "/", NROW(FILES))

    TEXT <- readLines(FILES[i], warn = FALSE, encoding = "UTF-8")

    # Package names of every call found; column 2 holds the captured name.
    # NOTE: calls inside comments are counted too; the text is not parsed.
    RESULT <- unlist(lapply(str_match_all(TEXT, PKG_CALL), function(m) m[, 2]))
    if (length(RESULT) == 0) next

    # package frequencies within this file, plus file metadata
    counts <- table(RESULT)
    info <- file.info(FILES[i])
    WDfreq[[i]] <- data.frame(pk = names(counts),
                              rep = as.integer(counts),
                              file = basename(FILES[i]),
                              size = info$size,
                              mtime = info$mtime,
                              stringsAsFactors = FALSE)
    WDfreq[[i]]$year <- format(WDfreq[[i]]$mtime, format = "%Y")
}

# Stack the per-file tables into one data frame (run exactly once).
WDfreq <- do.call("rbind", WDfreq)
cat("\nTotal packages:", NROW(WDfreq))
tail(WDfreq)

# ---- Step 3: aggregate by package ----

# Total calls per package and their share (%) of all calls
WDfreq2 <- ddply(WDfreq, .(pk), summarise, rep = sum(rep))
WDfreq2$perc <- (WDfreq2$rep / sum(WDfreq2$rep)) * 100
head(WDfreq2)

# ---- Step 4: plots ----

# plotting (TOP 50): sort ascending and keep the last 50 rows (or all rows
# when fewer than 50 packages were found)
dfplot <- WDfreq2[order(WDfreq2$perc), ]
dfplot <- tail(dfplot, 50)

# The axis limits are set to the sorted names so bars follow the ranking.
PLOT2 <- ggplot(dfplot, aes(pk, perc, fill = pk)) +
    geom_bar(stat = "identity", alpha = 0.5) +
    theme_minimal(base_size = 10) +
    theme(legend.position = "none") +
    coord_flip() +
    labs(title = "Percentage of library calls in my R & Rmd code",
         x = "package", y = "percentage of calls (%)", subtitle = "TOP 50") +
    scale_x_discrete(limits = as.character(dfplot$pk))
PLOT2

# Generate the layout.  This function returns a data frame with one line per
# bubble.  It gives its center (x and y) and its radius, proportional to the
# value
packing <- circleProgressiveLayout(WDfreq2$rep, sizetype = "area")
data <- cbind(WDfreq2, packing)

# The next step is to go from one center + a radius to the coordinates of a
# circle that is drawn by a multitude of straight lines.
dat.gg <- circleLayoutVertices(packing, npoints = 50)

# Make the plot
PLOT3 <- ggplot() +
    geom_polygon(data = dat.gg, aes(x, y, group = id, fill = as.factor(id)),
                 colour = "black", alpha = 0.5) +
    geom_text(data = data, aes(x, y, size = rep, label = pk)) +
    scale_size_continuous(range = c(1, 4)) +
    theme_void() +
    theme(legend.position = "none") +
    coord_equal()
PLOT3

# exporting plot3: open the device, draw explicitly, then close the device
# so the image file is completed
png(filename = "../circle_package_plot.png", width = 2000, height = 2000, pointsize = 12,
    res = 300)
print(PLOT3)
dev.off()

Share it!:

un blog sobre "data science"

Círculos proporcionales a la frecuencia de uso de paquetes de R

Paso 1: listando todos los archivos

Paso 2: leyendo los archivos de R

Paso 3: agregando el nombre de los packages

Paso 4: pintando resultados

Exportando la gráfica

Leave a Reply Cancel reply