Index a file for faster access to parts of the file
make_file_index(
pathname,
offset = 0,
skip = 0L,
n_max = Inf,
newline = "\n",
drop_eof = TRUE,
bfr_size = 5e+07
)
save_file_index(index, file)
read_file_index(file)
(character) The file to be indexed.
(numeric) The number of bytes to skip before indexing starts.
(numeric) The number of `newline` matches to ignore before recording them.
(numeric) The maximum number of bytes to scan.
(character) The character to scan for.
(logical) If TRUE, the last identified byte offset is dropped if at the very end of the file, i.e. when there is nothing available to read from that position.
(numeric) The number of bytes to read in each iteration.
(numeric vector) A sorted index of file byte positions.
A pathname to a `*.index` file to be created or read from.
A numeric vector of file byte offsets that correspond to the
beginning of a line, i.e. a position in the file that was preceded
by a `newline` character. The first line is at file byte offset 0,
which is also always the first element in the returned vector.
## An SGE accounting file
pathname <- system.file("exdata", "accounting", package = "wyntonquery")
## The corresponding SGE accounting index file
pathname_index <- sprintf("%s.index", pathname)
## Scan SGE accounting file to identify job offset positions
index <- make_file_index(pathname)
cat(sprintf("Number of jobs: %d\n", length(index)))
#> Number of jobs: 1004
str(index)
#> num [1:1004] 0 17 20 56 59 ...
## Save index to file
tf <- tempfile(fileext = ".index")
save_file_index(index, file = tf)
cat(sprintf("Saved index file: %s (%d bytes)\n", pathname, file.size(tf)))
#> Saved index file: /home/runner/work/_temp/Library/wyntonquery/exdata/accounting (8032 bytes)
## Read index from file
index <- read_file_index(tf)
cat(sprintf("Number of jobs: %d\n", length(index)))
#> Number of jobs: 1004
str(index)
#> num [1:1004] 0 17 20 56 59 ...
## Read jobs 301 to 350
jobs <- read_sge_accounting(pathname, offset = index[301], n_max = 50L)
print(jobs)
#> # A tibble: 50 × 45
#> qname hostname group owner job_name job_number account priority
#> <chr> <chr> <chr> <chr> <chr> <int> <chr> <int>
#> 1 member.q qb3-id1 group4 owner09 sleep.sh 2 sge 0
#> 2 member.q qb3-id1 group4 owner09 sleep.sh 2 sge 0
#> 3 member.q qb3-id1 group4 owner09 sleep.sh 2 sge 0
#> 4 member.q qb3-id1 group4 owner09 sleep.sh 2 sge 0
#> 5 member.q qb3-id1 group4 owner09 sleep.sh 2 sge 0
#> 6 long.q cc-id3 group6 owner01 AD_196_bvFTD_65… 3 sge 19
#> 7 long.q qb3-id1 group6 owner01 AD_196_bvFTD_65… 4 sge 19
#> 8 long.q cc-id2 group6 owner01 AD_196_PSP_58_m… 7 sge 19
#> 9 long.q cin-id3 group6 owner01 bvFTD_65_CONTRO… 9 sge 19
#> 10 long.q cc-id1 group6 owner01 AD_196_svPPA_46… 8 sge 19
#> # ℹ 40 more rows
#> # ℹ 37 more variables: submission_time <dttm>, start_time <dttm>,
#> # end_time <dttm>, failed <int>, exit_status <int>, ru_wallclock <drtn>,
#> # ru_utime <drtn>, ru_stime <drtn>, ru_maxrss <chr>, ru_ixrss <chr>,
#> # ru_ismrss <chr>, ru_idrss <chr>, ru_isrss <chr>, ru_minflt <dbl>,
#> # ru_majflt <dbl>, ru_nswap <dbl>, ru_inblock <dbl>, ru_oublock <dbl>,
#> # ru_msgsnd <dbl>, ru_msgrcv <dbl>, ru_nsignals <dbl>, ru_nvcsw <dbl>, …
## Read all jobs *after* the 500th job
jobs <- read_sge_accounting(pathname, offset = index[501])
print(jobs)
#> # A tibble: 504 × 45
#> qname hostname group owner job_name job_number account priority
#> <chr> <chr> <chr> <chr> <chr> <int> <chr> <int>
#> 1 long.q cc-id3 group6 owner07 job_qb3.sh 230 sge 19
#> 2 long.q cin-hmid1 group6 owner01 bmle_1_4_s3.sh 233 sge 19
#> 3 long.q cin-id2 group6 owner01 bmle_1_5_s3.sh 229 sge 19
#> 4 long.q cc-id1 group6 owner06 job.sh 147 sge 19
#> 5 long.q cc-hmid1 group3 owner04 cc-hmid1 238 sge 19
#> 6 long.q cc-id1 group3 owner04 cc-id1 239 sge 19
#> 7 long.q cin-hmid1 group3 owner04 cin-hmid1 242 sge 19
#> 8 long.q qb3-hmid1 group3 owner04 qb3-hmid1 246 sge 19
#> 9 long.q cin-id3 group3 owner04 cin-id3 245 sge 19
#> 10 long.q qb3-id1 group3 owner04 qb3-id1 247 sge 19
#> # ℹ 494 more rows
#> # ℹ 37 more variables: submission_time <dttm>, start_time <dttm>,
#> # end_time <dttm>, failed <int>, exit_status <int>, ru_wallclock <drtn>,
#> # ru_utime <drtn>, ru_stime <drtn>, ru_maxrss <chr>, ru_ixrss <chr>,
#> # ru_ismrss <chr>, ru_idrss <chr>, ru_isrss <chr>, ru_minflt <dbl>,
#> # ru_majflt <dbl>, ru_nswap <dbl>, ru_inblock <dbl>, ru_oublock <dbl>,
#> # ru_msgsnd <dbl>, ru_msgrcv <dbl>, ru_nsignals <dbl>, ru_nvcsw <dbl>, …
## Cleanup
file.remove(tf)
#> [1] TRUE