首页 > 解决方案 > R:如何按人类可读的大小排序

问题描述

我有一组数据,想按大小排序,但各项的大小从约 140K 到约 130G 不等,差异很大,直接按字节数显示很难阅读。我可以把输入数据改成人类可读的大小(如 7.3G),但绘图时它们并不会按预期的大小顺序排列。怎样才能让人类可读的大小按实际大小正确排序?

代码:

library(ggplot2)

# Load the CSV; the columns used here are `start` (timestamp string) and
# `size` (human-readable size such as "7.3G").
mydata <- read.csv("/path/to/test.csv")
restore.df <- data.frame(
  Start = as.POSIXct(mydata$start),
  Size = mydata$size,
  # Date part only: strip the trailing " HH:MM:SS" from the timestamp.
  Labels = gsub(" [0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}", "", mydata$start)
)

# `Start` is POSIXct, so the x axis needs a date-time scale (scale_x_date()
# only accepts Date values). Both limits are passed as one length-2 vector,
# and the scaled plot is assigned back to `p` so that the PNG written below
# actually includes the axis settings.
p <- ggplot(restore.df, aes(x = Start, y = Size)) +
  geom_point() +
  scale_x_datetime(
    date_labels = "%y-%m-%d",
    limits = as.POSIXct(c("2018-06-14", "2018-06-20"))
  )

png(filename = "/path/to/test.png", width = 1368, height = 1060, units = "px")
print(p)
dev.off()

结果: 结果

精简后的数据集:

start,stop,time,size
"2018-06-14 17:30:05","2018-06-14 17:30:05",3.6,7.3G
"2018-06-14 17:33:47","2018-06-14 17:33:47",1.05,304M
"2018-06-14 17:35:07","2018-06-14 17:35:07",62.9666666666667,132G
"2018-06-14 23:33:51","2018-06-14 23:33:51",0,880K
"2018-06-14 23:34:13","2018-06-14 23:34:13",1.16666666666667,305M
"2018-06-17 01:34:56","2018-06-17 01:34:56",20.2666666666667,6.2G
"2018-06-17 01:56:13","2018-06-17 01:56:13",15.7833333333333,9.4G
"2018-06-22 17:34:33","2018-06-22 17:34:33",0,144K

标签: r ggplot2

解决方案


我不确定是否已有现成的包可以做这种转换,但你可以先把人类可读的大小转换为字节数,再按该数值排序,然后绘图,并根据需要调整 y 轴的刻度标签。

library(dplyr)
library(ggplot2)
# Sample data set from the question. `size` stays a human-readable string
# (e.g. "7.3G"); `start` and `stop` are timestamp strings.
d <- data.frame(
  start = c(
    "2018-06-14 17:30:05", "2018-06-14 17:33:47", "2018-06-14 17:35:07",
    "2018-06-14 23:33:51", "2018-06-14 23:34:13", "2018-06-17 01:34:56",
    "2018-06-17 01:56:13", "2018-06-22 17:34:33"
  ),
  stop = c(
    "2018-06-14 17:30:05", "2018-06-14 17:33:47", "2018-06-14 17:35:07",
    "2018-06-14 23:33:51", "2018-06-14 23:34:13", "2018-06-17 01:34:56",
    "2018-06-17 01:56:13", "2018-06-22 17:34:33"
  ),
  time = c(
    3.6, 1.05, 62.9666666666667, 0,
    1.16666666666667, 20.2666666666667, 15.7833333333333, 0
  ),
  size = c("7.3G", "304M", "132G", "880K", "305M", "6.2G", "9.4G", "144K"),
  # Keep the text columns as character on every R version.
  stringsAsFactors = FALSE
)

## Convert human-readable sizes ("880K", "7.3G", ...) to a number of bytes.
##
## x: character vector (or factor) of sizes. Each element is a number,
##    optionally followed by one unit suffix K, M, G or T, interpreted as
##    powers of 1024. A bare number is taken to be bytes already. Numeric
##    input is returned unchanged as double.
## Returns a double vector with the same length as `x`. NA input gives NA;
## text that cannot be parsed gives NA and raises a warning.
convert_size <- function(x) {
  if (is.numeric(x)) {
    return(as.numeric(x))
  }
  x <- trimws(as.character(x))
  multipliers <- c(K = 1024, M = 1024^2, G = 1024^3, T = 1024^4)

  # The unit, if present, is the final character of the string.
  last_char <- substr(x, nchar(x), nchar(x))
  has_unit <- !is.na(x) & last_char %in% names(multipliers)

  # Everything before the unit is the numeric part; parse failures are
  # reported once below instead of as a generic coercion warning.
  number_part <- ifelse(has_unit, substr(x, 1, nchar(x) - 1), x)
  magnitude <- suppressWarnings(as.numeric(number_part))

  scale_by <- rep(1, length(x))
  scale_by[has_unit] <- multipliers[last_char[has_unit]]
  bytes <- magnitude * scale_by

  unparsed <- is.na(bytes) & !is.na(x)
  if (any(unparsed)) {
    warning(
      "convert_size(): could not parse size(s): ",
      paste(unique(x[unparsed]), collapse = ", "),
      call. = FALSE
    )
  }
  bytes
}

# Convert every size to bytes, then order the rows by that numeric value.
# vapply() (rather than sapply()) guarantees a plain, unnamed numeric column.
d2 <- d %>%
  mutate(fsize = vapply(size, convert_size, numeric(1), USE.NAMES = FALSE)) %>%
  arrange(fsize)

restore.df <- data.frame(
  Start = as.POSIXct(d2$start),
  Size = d2$size,
  FSize = d2$fsize,
  # Date part only: strip the trailing " HH:MM:SS" from the timestamp.
  Labels = gsub(" [0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}", "", d2$start)
)
print(restore.df)
# Fractional sizes keep their fraction (e.g. 6.2 * 1024^3), so FSize for the
# "6.2G", "7.3G" and "9.4G" rows is not a whole number of GiB.
#>                 Start Size        FSize     Labels
#> 1 2018-06-22 17:34:33 144K       147456 2018-06-22
#> 2 2018-06-14 23:33:51 880K       901120 2018-06-14
#> 3 2018-06-14 17:33:47 304M    318767104 2018-06-14
#> 4 2018-06-14 23:34:13 305M    319815680 2018-06-14
#> 5 2018-06-17 01:34:56 6.2G   6657199309 2018-06-17
#> 6 2018-06-14 17:30:05 7.3G   7838315315 2018-06-14
#> 7 2018-06-17 01:56:13 9.4G  10093173146 2018-06-17
#> 8 2018-06-14 17:35:07 132G 141733920768 2018-06-14

## plot
# Axis breaks at round human-readable sizes: the strings are used as the tick
# labels and convert_size() gives their positions in bytes.
bks <- c("100K", "1M", "100M", "1G", "10G", "100G")
# Take the x value from the date string in `Labels`: as.Date() on a POSIXct
# value converts in UTC by default, which can move a late-evening timestamp
# to a different calendar day depending on the session time zone.
p <- ggplot(restore.df, aes(x = as.Date(Labels), y = FSize)) + geom_point()
p +
  scale_x_date(
    date_labels = "%Y-%m-%d",
    limits = as.Date(c("2018-06-14", "2018-06-20"))
  ) +
  scale_y_log10(
    breaks = vapply(bks, convert_size, numeric(1), USE.NAMES = FALSE),
    labels = bks
  )

#Created on 2018-07-24 by the [reprex package](http://reprex.tidyverse.org) (v0.2.0.9000).

推荐阅读