Introduction

The SGE accounting file holds information on all finished jobs submitted to the SGE scheduler. Queued and currently running jobs are not included in this file. A job is considered “finished” if it completed successfully, terminated due to an error, or was cancelled. The file is a colon-delimited text file with one job entry per row, with the most recently finished job appended at the end. Contrary to what one might expect, the file is not perfectly ordered by the end_time of the jobs; the end_time of two consecutive entries may occasionally differ by a few seconds in the “wrong” order. It is not clear why this is, but one possibility is that the scheduler updates the SGE accounting file at regular intervals, say every few minutes, and, when it does, writes out the jobs in order of job index.
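
To get a sense of the raw format, here is a minimal sketch in base R (not part of the wyntonquery API, other than using sge_accounting_file(), which also appears below, to locate the file); the number of comment lines at the top may vary between SGE installations:

library(wyntonquery)

pathname <- sge_accounting_file()

## The first few lines; header comments start with '#', and each
## remaining line is one colon-delimited job entry
raw <- readLines(pathname, n = 6L)

## Split the first job entry into its fields (assuming at least one
## job entry appears within the first few lines)
entry <- raw[!startsWith(raw, "#")][1]
fields <- strsplit(entry, split = ":", fixed = TRUE)[[1]]
length(fields)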

Indexing the SGE accounting file

In March 2020, the Wynton SGE accounting file was ~12 GB and took 6-8 minutes to read. In June 2021, it was ~58 GB and had 161 million job entries. In October 2021, it was ~69 GB and had 192 million job entries. The accounting file was “rolled over” on 2022-03-02, that is, the old file was renamed and replaced by a new one.

As seen below, in December 2024, the new accounting file was ~112 GB and had 300 million job entries; counting them with wc -l took a little more than 4 minutes.
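
For the record, that line count can be reproduced from within R by shelling out to wc; a sketch:

library(wyntonquery)

pathname <- sge_accounting_file()

## Count the number of lines (job entries plus a few header lines)
system2("wc", args = c("-l", shQuote(pathname)))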

It is rare that we are interested in all entries; more commonly, we work with a subset of the job entries. To do this efficiently, we start by indexing the SGE accounting file to identify the file byte offset of each job entry;

library(wyntonquery)
library(progressr)
handlers(global = TRUE) ## Report on progress
handlers("cli")

pathname <- sge_accounting_file()
cat(sprintf("File size: %.3g GB\n", file.size(pathname)/1000^3))
#> File size: 112 GB

## It takes ~15 minutes to index a 112 GB SGE accounting file.
## Indexing requires only a small amount of memory, i.e. only
## a portion of the accounting file is in memory at any time.
index <- make_file_index(pathname, skip = 4L)
cat(sprintf("Number of job entries: %d\n", length(index)))
#> Number of job entries: 300509469

## Save per-job index to file
save_file_index(index, file = "accounting.index_by_row")
cat(sprintf("File size: %.3g GB\n", file.size("accounting.index_by_row")/1000^3))                                                  
#> File size: 2.4 GB
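
With this index, we can then jump directly to any individual job entry without scanning the file from the start. A minimal sketch, assuming each element of index holds the byte offset of the corresponding job entry:

## Read, say, the 200 millionth job entry directly via its byte offset
## (assuming the file has at least that many entries)
con <- file(pathname, open = "rb")
seek(con, where = index[200e6])
entry <- readLines(con, n = 1L)
close(con)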

Next, we want to group the job entries by the ISO-8601 week of the job end times.

library(wyntonquery)
library(progressr)
handlers(global = TRUE) ## Report on progress
handlers("cli")

pathname <- sge_accounting_file()
index <- read_file_index("accounting.index_by_row")
cat(sprintf("Number of job entries: %d\n", length(index)))
#> Number of job entries: 300509469

## It takes ~30 minutes to build the week index for 300 million entries
week_index <- sge_make_week_index(pathname, index = index)
saveRDS(week_index, file = "accounting.index_by_week.rds")
cat(sprintf("File size: %.3g kB\n", file.size("accounting.index_by_week.rds")/1000))
#> File size: 2.11 kB

range(week_index$week, na.rm = TRUE)
#> [1] "2022W09" "2024W50"

print(week_index)
#> # A tibble: 144 × 3
#>    week    nbr_of_jobs file_offset
#>    <chr>         <dbl>       <dbl>
#>  1 NA               28        1188
#>  2 2022W09     1177975        9628
#>  3 2022W10     1578885   421146956
#>  4 2022W11     1672371  1000012580
#>  5 2022W12      843393  1617407461
#>  6 2022W13     1688776  1930692024
#>  7 2022W14     1324258  2551671766
#>  8 2022W15     1413045  3044101933
#>  9 2022W16     1435363  3567520111
#> 10 2022W17     1825431  4101619930
#> # ℹ 134 more rows
#> # ℹ Use `print(n = ...)` to see more rows
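
As an aside, the “YYYYWnn” labels follow ISO-8601 week numbering, which base R can produce on most platforms via the %G (ISO year) and %V (ISO week) format specifiers:

t <- as.POSIXct("2024-09-30 12:00:00", tz = "UTC")
format(t, format = "%GW%V")
#> [1] "2024W40"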

Reading job entries in SGE accounting file

Here is an example of how we can read the job entries for a couple of weeks. Note that, because the accounting file is not perfectly ordered by end_time, the entries read this way may also include some jobs whose end times fall in neighboring weeks:

library(wyntonquery)
library(dplyr)

pathname <- sge_accounting_file()
week_index <- readRDS("accounting.index_by_week.rds")

weeks <- subset(week_index, week %in% c("2024W40", "2024W41"))
print(weeks)
#> # A tibble: 2 × 3
#>   week    nbr_of_jobs file_offset
#>   <chr>         <dbl>       <dbl>
#> 1 2024W40     3466942 96963551199
#> 2 2024W41     4598813 98239520943

offset <- weeks$file_offset[1]
n_max <- sum(weeks$nbr_of_jobs)
cat(sprintf("Number of job entries to read: %d\n", n_max))
#> Number of job entries to read: 8065755

## It takes ~45 seconds to read the ~8 million job entries of interest
jobs <- read_sge_accounting(pathname, offset = offset, n_max = n_max)

## We anonymize the content so we can share it publicly
jobs <- anonymize(jobs)
print(head(select(jobs, -account)))
#> # A tibble: 6 × 44
#>   qname    hostname group owner job_name job_number priority submission_time    
#>   <chr>    <chr>    <chr> <chr> <chr>         <int>    <int> <dttm>             
#> 1 member.q qb3-id1… grou… owne… rundock…    3535323        0 2024-09-27 22:05:07
#> 2 long.q   qb3-atg… grou… owne… batch_3d    3542680       19 2024-09-29 23:11:20
#> 3 long.q   qb3-id2… grou… owne… batch_3d    3542680       19 2024-09-29 23:11:20
#> 4 long.q   qb3-id1… grou… owne… batch_3d    3542680       19 2024-09-29 23:11:20
#> 5 member.q qb3-id2… grou… owne… rundock…    3430911        0 2024-09-21 12:34:07
#> 6 member.q qb3-id2… grou… owne… batch_3d    3542680        0 2024-09-29 23:11:20
#> # ℹ 36 more variables: start_time <dttm>, end_time <dttm>, failed <int>,
#> #   exit_status <int>, ru_wallclock <drtn>, ru_utime <drtn>, ru_stime <drtn>,
#> #   ru_maxrss <chr>, ru_ixrss <chr>, ru_ismrss <chr>, ru_idrss <chr>,
#> #   ru_isrss <chr>, ru_minflt <dbl>, ru_majflt <dbl>, ru_nswap <dbl>,
#> #   ru_inblock <dbl>, ru_oublock <dbl>, ru_msgsnd <dbl>, ru_msgrcv <dbl>,
#> #   ru_nsignals <dbl>, ru_nvcsw <dbl>, ru_nivcsw <dbl>, project <chr>,
#> #   department <chr>, granted_pe <chr>, slots <int>, task_number <int>, …
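
The same pattern works for a single week, e.g. a sketch that reads only the 2024W40 entries:

w <- subset(week_index, week == "2024W40")
jobs_w40 <- read_sge_accounting(pathname, offset = w$file_offset, n_max = w$nbr_of_jobs)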

Statistics on these jobs

period <- range(jobs$end_time, na.rm=TRUE)
cat(sprintf("Period: %s/%s\n", period[1], period[2]))
#> Period: 2024-09-19 22:59:22/2024-10-14 00:00:00

jobs <- add_weeks(jobs)
period <- range(jobs$end_time_week, na.rm=TRUE)
cat(sprintf("Period (weeks): %s/%s\n", period[1], period[2]))
#> Period (weeks): 2024W38/2024W42

cat(sprintf("Number of jobs finished during this period: %d\n", nrow(jobs)))
#> Number of jobs finished during this period: 8065755

nusers <- length(unique(jobs$owner))
cat(sprintf("Number of unique users: %d\n", nusers))
#> Number of unique users: 276
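
For a per-user breakdown, dplyr (loaded above) makes this straightforward; a sketch:

## Number of finished jobs per (anonymized) user, largest first
print(head(count(jobs, owner, sort = TRUE)))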

Let’s see how many of the jobs finished successfully and how many failed.

## Get successful and failed jobs
groups <- list(
  success = subset(jobs, failed == 0L),
  fail    = subset(jobs, failed > 0L)
)
stats <- sapply(groups, nrow)
print(stats)
#> success    fail
#> 7957155  108600

print(stats/sum(stats))
#>    success       fail 
#> 0.98653567 0.01346433 

We see that the failure rate among the ~8 million jobs that ran during this period was ~1.3%. Next, let’s see how much CPU time this corresponds to:

## CPU time consumed
cpu <- lapply(groups, function(jobs) { d <- sum(jobs$cpu); units(d) <- "days"; d })
total <- tibble(outcome = names(cpu), cpu = do.call(c, cpu))
cat(sprintf("Total CPU processing time: %.1f %s\n", sum(total$cpu), units(total$cpu)))
#> Total CPU processing time: 163053.1 days

print(total)
#> # A tibble: 2 × 2
#>   outcome cpu           
#>   <chr>   <drtn>        
#> 1 success 129757.53 days
#> 2 fail     33295.59 days

## CPU-time fractions
ratio <- mutate(total, cpu = { x <- as.numeric(cpu); x / sum(x) })
print(ratio)
#> # A tibble: 2 × 2
#>   outcome   cpu
#>   <chr>   <dbl>
#> 1 success 0.796
#> 2 fail    0.204

From this, we find that during these two weeks, ~20% of the CPU time was consumed by jobs that failed. Among the failed jobs, the failure code was distributed as:

codes <- groups$fail$failed
print(table(codes))
#> codes
#>     1    21    25    26    28    37   100 
#>    91     5 32944  1152    11 46523 27874

For details on these codes, see help("read_sge_accounting", package="wyntonquery"), or:

subset(sge_failed_codes(), Code %in% unique(codes), select=c(Code, Explanation))
#> # A tibble: 7 × 2
#>    Code Explanation
#>   <int> <chr>
#> 1     1 failed early in execd
#> 2    21 qmaster asked about an unknown job (not in accounting?)
#> 3    25 ran, will be rescheduled
#> 4    26 failed opening stderr/stdout file
#> 5    28 failed changing to start directory
#> 6    37 ran, but killed due to exceeding run time limit
#> 7   100 ran, but killed by a signal (perhaps due to exceeding resources), task …

The jobs that exhausted their run-time limit consumed 24,485 days of CPU time;

d <- sum(subset(groups$fail, failed == 37)$cpu); units(d) <- "days"
print(d)
#> Time difference of 24484.79 days

which corresponds to ~15% of all CPU time spent:

as.numeric(d)/sum(as.numeric(total$cpu))
#> [1] 0.1501645
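
To see how the failed CPU time breaks down across all failure codes, not just code 37, here is a sketch using dplyr (assuming, as above, that the cpu field is a difftime):

## Total CPU days per failure code, largest first
by_code <- summarize(group_by(groups$fail, failed),
                     cpu_days = sum(as.numeric(cpu, units = "days")))
print(arrange(by_code, desc(cpu_days)))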