Index a file for faster access to parts of the file

make_file_index(
  pathname,
  offset = NULL,
  skip = 0L,
  index = NULL,
  n_max = Inf,
  newline = "\n",
  drop_eof = TRUE,
  bfr_size = 5e+07
)

save_file_index(index, file)

read_file_index(file)

Arguments

pathname

(character) The file to be indexed.

offset

(numeric) The number of bytes to skip before indexing starts.

skip

(numeric) The number of newline matches to ignore before recording them.

index

(numeric vector) A sorted index of file byte positions.

n_max

(numeric) The maximum number of bytes to scan.

newline

(character) The character to scan for.

drop_eof

(logical) If TRUE, the last identified byte offset is dropped if at the very end of the file, i.e. when there is nothing available to read from that position.

bfr_size

(numeric) The number of bytes to read in each iteration.

file

A pathname to a *.index file to be created or read from.

Value

A numeric vector of file byte offsets, each of which corresponds to the beginning of a line, i.e. a position in the file that was preceded by a newline character. The first line is at file byte offset 0, which is also always the first element in the returned vector.

Examples

## An SGE accounting file
pathname <- system.file("exdata", "accounting", package = "wyntonquery")

## The corresponding SGE accounting index file
pathname_index <- sprintf("%s.index", pathname)

## Scan SGE accounting file to identify job offset positions
index <- make_file_index(pathname)
cat(sprintf("Number of jobs: %d\n", length(index)))
#> Number of jobs: 1004
str(index)
#>  num [1:1004] 0 17 20 56 59 ...

## Save index to file
tf <- tempfile(fileext = ".index")
save_file_index(index, file = tf)
cat(sprintf("Saved index file: %s (%d bytes)\n", pathname, file.size(tf)))
#> Saved index file: /home/runner/work/_temp/Library/wyntonquery/exdata/accounting (8032 bytes)

## Read index from file
index <- read_file_index(tf)
cat(sprintf("Number of jobs: %d\n", length(index)))
#> Number of jobs: 1004
str(index)
#>  num [1:1004] 0 17 20 56 59 ...

## Read jobs 301 to 350
jobs <- read_sge_accounting(pathname, offset = index[301], n_max = 50L)
print(jobs)
#> # A tibble: 50 × 45
#>    qname    hostname group  owner   job_name         job_number account priority
#>    <chr>    <chr>    <chr>  <chr>   <chr>                 <int> <chr>      <int>
#>  1 member.q qb3-id1  group4 owner09 sleep.sh                  2 sge            0
#>  2 member.q qb3-id1  group4 owner09 sleep.sh                  2 sge            0
#>  3 member.q qb3-id1  group4 owner09 sleep.sh                  2 sge            0
#>  4 member.q qb3-id1  group4 owner09 sleep.sh                  2 sge            0
#>  5 member.q qb3-id1  group4 owner09 sleep.sh                  2 sge            0
#>  6 long.q   cc-id3   group6 owner01 AD_196_bvFTD_65…          3 sge           19
#>  7 long.q   qb3-id1  group6 owner01 AD_196_bvFTD_65…          4 sge           19
#>  8 long.q   cc-id2   group6 owner01 AD_196_PSP_58_m…          7 sge           19
#>  9 long.q   cin-id3  group6 owner01 bvFTD_65_CONTRO…          9 sge           19
#> 10 long.q   cc-id1   group6 owner01 AD_196_svPPA_46…          8 sge           19
#> # ℹ 40 more rows
#> # ℹ 37 more variables: submission_time <dttm>, start_time <dttm>,
#> #   end_time <dttm>, failed <int>, exit_status <int>, ru_wallclock <drtn>,
#> #   ru_utime <drtn>, ru_stime <drtn>, ru_maxrss <chr>, ru_ixrss <chr>,
#> #   ru_ismrss <chr>, ru_idrss <chr>, ru_isrss <chr>, ru_minflt <dbl>,
#> #   ru_majflt <dbl>, ru_nswap <dbl>, ru_inblock <dbl>, ru_oublock <dbl>,
#> #   ru_msgsnd <dbl>, ru_msgrcv <dbl>, ru_nsignals <dbl>, ru_nvcsw <dbl>, …

## Read all jobs *after* the 500th job
jobs <- read_sge_accounting(pathname, offset = index[501])
print(jobs)
#> # A tibble: 504 × 45
#>    qname  hostname  group  owner   job_name       job_number account priority
#>    <chr>  <chr>     <chr>  <chr>   <chr>               <int> <chr>      <int>
#>  1 long.q cc-id3    group6 owner07 job_qb3.sh            230 sge           19
#>  2 long.q cin-hmid1 group6 owner01 bmle_1_4_s3.sh        233 sge           19
#>  3 long.q cin-id2   group6 owner01 bmle_1_5_s3.sh        229 sge           19
#>  4 long.q cc-id1    group6 owner06 job.sh                147 sge           19
#>  5 long.q cc-hmid1  group3 owner04 cc-hmid1              238 sge           19
#>  6 long.q cc-id1    group3 owner04 cc-id1                239 sge           19
#>  7 long.q cin-hmid1 group3 owner04 cin-hmid1             242 sge           19
#>  8 long.q qb3-hmid1 group3 owner04 qb3-hmid1             246 sge           19
#>  9 long.q cin-id3   group3 owner04 cin-id3               245 sge           19
#> 10 long.q qb3-id1   group3 owner04 qb3-id1               247 sge           19
#> # ℹ 494 more rows
#> # ℹ 37 more variables: submission_time <dttm>, start_time <dttm>,
#> #   end_time <dttm>, failed <int>, exit_status <int>, ru_wallclock <drtn>,
#> #   ru_utime <drtn>, ru_stime <drtn>, ru_maxrss <chr>, ru_ixrss <chr>,
#> #   ru_ismrss <chr>, ru_idrss <chr>, ru_isrss <chr>, ru_minflt <dbl>,
#> #   ru_majflt <dbl>, ru_nswap <dbl>, ru_inblock <dbl>, ru_oublock <dbl>,
#> #   ru_msgsnd <dbl>, ru_msgrcv <dbl>, ru_nsignals <dbl>, ru_nvcsw <dbl>, …

## Cleanup
file.remove(tf)
#> [1] TRUE