Index a file for faster access to parts of the file

make_file_index(
  pathname,
  offset = NULL,
  skip = 0L,
  index = NULL,
  n_max = Inf,
  newline = "\n",
  drop_eof = TRUE,
  bfr_size = 5e+07
)

save_file_index(index, file)

read_file_index(file)

Arguments

pathname: (character) The file to be indexed.
offset: (numeric) The number of bytes to skip before indexing starts.
skip: (numeric) The number of newline matches to ignore before recording them.
index: (numeric vector) A sorted index of file byte positions.
n_max: (numeric) The maximum number of bytes to scan.
newline: (character) The character to scan for.
drop_eof: (logical) If TRUE, the last identified byte offset is dropped if at the very end of the file, i.e. when there is nothing available to read from that position.
bfr_size: (numeric) The number of bytes to read in each iteration.
file: A pathname to a *.index file to be created or read from.

Value

A numeric vector of file byte offsets, each corresponding to the beginning of a line, i.e. a position in the file that was preceded by a newline character. The first line is at file byte offset 0, which is also always the first element in the returned vector.

Examples

## An SGE accounting file
pathname <- system.file("exdata", "accounting", package = "wyntonquery")

## The corresponding SGE accounting index file
pathname_index <- sprintf("%s.index", pathname)

## Scan SGE accounting file to identify job offset positions
index <- make_file_index(pathname)
cat(sprintf("Number of jobs: %d\n", length(index)))
#> Number of jobs: 1004
str(index)
#>  num [1:1004] 0 17 20 56 59 ...

## Save index to file
tf <- tempfile(fileext = ".index")
save_file_index(index, file = tf)
cat(sprintf("Saved index file: %s (%d bytes)\n", pathname, file.size(tf)))
#> Saved index file: /home/runner/work/_temp/Library/wyntonquery/exdata/accounting (8032 bytes)

## Read index from file
index <- read_file_index(tf)
cat(sprintf("Number of jobs: %d\n", length(index)))
#> Number of jobs: 1004
str(index)
#>  num [1:1004] 0 17 20 56 59 ...

## Read jobs 301 to 350
jobs <- read_sge_accounting(pathname, offset = index[301], n_max = 50L)
print(jobs)
#> # A tibble: 50 × 45
#>    qname    hostname group  owner   job_name         job_number account priority
#>    <chr>    <chr>    <chr>  <chr>   <chr>                 <int> <chr>      <int>
#>  1 member.q qb3-id1  group4 owner09 sleep.sh                  2 sge            0
#>  2 member.q qb3-id1  group4 owner09 sleep.sh                  2 sge            0
#>  3 member.q qb3-id1  group4 owner09 sleep.sh                  2 sge            0
#>  4 member.q qb3-id1  group4 owner09 sleep.sh                  2 sge            0
#>  5 member.q qb3-id1  group4 owner09 sleep.sh                  2 sge            0
#>  6 long.q   cc-id3   group6 owner01 AD_196_bvFTD_65…          3 sge           19
#>  7 long.q   qb3-id1  group6 owner01 AD_196_bvFTD_65…          4 sge           19
#>  8 long.q   cc-id2   group6 owner01 AD_196_PSP_58_m…          7 sge           19
#>  9 long.q   cin-id3  group6 owner01 bvFTD_65_CONTRO…          9 sge           19
#> 10 long.q   cc-id1   group6 owner01 AD_196_svPPA_46…          8 sge           19
#> # ℹ 40 more rows
#> # ℹ 37 more variables: submission_time <dttm>, start_time <dttm>,
#> #   end_time <dttm>, failed <int>, exit_status <int>, ru_wallclock <drtn>,
#> #   ru_utime <drtn>, ru_stime <drtn>, ru_maxrss <chr>, ru_ixrss <chr>,
#> #   ru_ismrss <chr>, ru_idrss <chr>, ru_isrss <chr>, ru_minflt <dbl>,
#> #   ru_majflt <dbl>, ru_nswap <dbl>, ru_inblock <dbl>, ru_oublock <dbl>,
#> #   ru_msgsnd <dbl>, ru_msgrcv <dbl>, ru_nsignals <dbl>, ru_nvcsw <dbl>, …

## Read all jobs *after* the 500th job
jobs <- read_sge_accounting(pathname, offset = index[501])
print(jobs)
#> # A tibble: 504 × 45
#>    qname  hostname  group  owner   job_name       job_number account priority
#>    <chr>  <chr>     <chr>  <chr>   <chr>               <int> <chr>      <int>
#>  1 long.q cc-id3    group6 owner07 job_qb3.sh            230 sge           19
#>  2 long.q cin-hmid1 group6 owner01 bmle_1_4_s3.sh        233 sge           19
#>  3 long.q cin-id2   group6 owner01 bmle_1_5_s3.sh        229 sge           19
#>  4 long.q cc-id1    group6 owner06 job.sh                147 sge           19
#>  5 long.q cc-hmid1  group3 owner04 cc-hmid1              238 sge           19
#>  6 long.q cc-id1    group3 owner04 cc-id1                239 sge           19
#>  7 long.q cin-hmid1 group3 owner04 cin-hmid1             242 sge           19
#>  8 long.q qb3-hmid1 group3 owner04 qb3-hmid1             246 sge           19
#>  9 long.q cin-id3   group3 owner04 cin-id3               245 sge           19
#> 10 long.q qb3-id1   group3 owner04 qb3-id1               247 sge           19
#> # ℹ 494 more rows
#> # ℹ 37 more variables: submission_time <dttm>, start_time <dttm>,
#> #   end_time <dttm>, failed <int>, exit_status <int>, ru_wallclock <drtn>,
#> #   ru_utime <drtn>, ru_stime <drtn>, ru_maxrss <chr>, ru_ixrss <chr>,
#> #   ru_ismrss <chr>, ru_idrss <chr>, ru_isrss <chr>, ru_minflt <dbl>,
#> #   ru_majflt <dbl>, ru_nswap <dbl>, ru_inblock <dbl>, ru_oublock <dbl>,
#> #   ru_msgsnd <dbl>, ru_msgrcv <dbl>, ru_nsignals <dbl>, ru_nvcsw <dbl>, …

## Cleanup
file.remove(tf)
#> [1] TRUE