download.file()
!!! (41,389,465 rows of data)
read_csv()
arrow::open_dataset()
ISBN
is empty for 80k rows, so we specify its column type explicitly
seattle_csv |> glimpse()
#> FileSystemDataset with 1 csv file
#> 41,389,465 rows x 12 columns
#> $ UsageClass <string> "Physical", "Physical", "Digital", "Physical", "Ph…
#> $ CheckoutType <string> "Horizon", "Horizon", "OverDrive", "Horizon", "Hor…
#> $ MaterialType <string> "BOOK", "BOOK", "EBOOK", "BOOK", "SOUNDDISC", "BOO…
#> $ CheckoutYear <int64> 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 20…
#> $ CheckoutMonth <int64> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,…
#> $ Checkouts <int64> 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 2, 3, 2, 1, 3, 2,…
#> $ Title <string> "Super rich : a guide to having it all / Russell S…
#> $ ISBN <string> "", "", "", "", "", "", "", "", "", "", "", "", ""…
#> $ Creator <string> "Simmons, Russell", "Barclay, James, 1965-", "Tim …
#> $ Subjects <string> "Self realization, Conduct of life, Attitude Psych…
#> $ Publisher <string> "Gotham Books,", "Pyr,", "Random House, Inc.", "Di…
#> $ PublicationYear <string> "c2011.", "2010.", "2015", "2005.", "c2004.", "c20…
seattle_csv |>
group_by(CheckoutYear) |>
summarise(Checkouts = sum(Checkouts)) |>
arrange(CheckoutYear) |>
collect()
#> # A tibble: 18 × 2
#> CheckoutYear Checkouts
#> <int> <int>
#> 1 2005 3798685
#> 2 2006 6599318
#> 3 2007 7126627
#> 4 2008 8438486
#> 5 2009 9135167
#> 6 2010 8608966
#> # ℹ 12 more rows
parquet
datasets with {arrow}
But:
dplyr::group_by()
to define partitions
arrow::write_dataset()
to save as parquet
tibble(
files = list.files(pq_path, recursive = TRUE),
size_MB = file.size(file.path(pq_path, files)) / 1024^2
)
#> # A tibble: 18 × 2
#> files size_MB
#> <chr> <dbl>
#> 1 CheckoutYear=2005/part-0.parquet 109.
#> 2 CheckoutYear=2006/part-0.parquet 164.
#> 3 CheckoutYear=2007/part-0.parquet 178.
#> 4 CheckoutYear=2008/part-0.parquet 195.
#> 5 CheckoutYear=2009/part-0.parquet 214.
#> 6 CheckoutYear=2010/part-0.parquet 222.
#> # ℹ 12 more rows
?arrow-dplyr
for supported functions
(or ?acero, but that’s way harder to remember)