summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMark H Weaver <mhw@netris.org>2016-03-09 01:23:53 -0500
committerMark H Weaver <mhw@netris.org>2016-08-09 17:59:26 -0400
commit5a1add373ab427a3b336981d857252e703a9f8d1 (patch)
tree8f8da6f332499c5f4ee6153129917b52a3a4259c
parentba6d25f3b953392136ead2f1ca8af71466da2dae (diff)
grafts: Make grafting faster.
* guix/build/graft.scm (replace-store-references): Reimplement for faster grafting. Use binary I/O instead of textual I/O. Replace 'mapping' argument (an alist) with 'replacement-table' (a vhash). (rewrite-directory): Adapt to mapping argument change in 'replace-store-references'. Remove 'with-fluids' that previously set '%default-port-encoding' to #f, since we now use binary I/O. (define-inline, hash-length): New macros. (nix-base32-char?): New variable.
-rw-r--r--guix/build/graft.scm221
1 files changed, 169 insertions, 52 deletions
diff --git a/guix/build/graft.scm b/guix/build/graft.scm
index fb21fc3af3..f85d485554 100644
--- a/guix/build/graft.scm
+++ b/guix/build/graft.scm
@@ -1,5 +1,6 @@
;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2014, 2015, 2016 Ludovic Courtès <ludo@gnu.org>
+;;; Copyright © 2016 Mark H Weaver <mhw@netris.org>
;;;
;;; This file is part of GNU Guix.
;;;
@@ -20,8 +21,12 @@
#:use-module (guix build utils)
#:use-module (rnrs bytevectors)
#:use-module (rnrs io ports)
+ #:use-module (ice-9 vlist)
#:use-module (ice-9 match)
#:use-module (ice-9 threads)
+ #:use-module (ice-9 binary-ports)
+ #:use-module (srfi srfi-1) ; list library
+ #:use-module (srfi srfi-26) ; cut and cute
#:export (replace-store-references
rewrite-directory))
@@ -38,50 +43,134 @@
;;;
;;; Code:
-(define* (replace-store-references input output mapping
+(define-syntax-rule (define-inline name val)
+ (define-syntax name (identifier-syntax val)))
+
+(define-inline hash-length 32)
+
+(define nix-base32-char?
+ (cute char-set-contains?
+ ;; ASCII digits and lower case letters except e o t u
+ (string->char-set "0123456789abcdfghijklmnpqrsvwxyz")
+ <>))
+
+(define* (replace-store-references input output replacement-table
#:optional (store (%store-directory)))
- "Read data from INPUT, replacing store references according to MAPPING, and
-writing the result to OUTPUT."
- (define pattern
- (let ((nix-base32-chars
- '(#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9
- #\a #\b #\c #\d #\f #\g #\h #\i #\j #\k #\l #\m #\n
- #\p #\q #\r #\s #\v #\w #\x #\y #\z)))
- `(,@(map char-set (string->list store))
- ,(char-set #\/)
- ,@(make-list 32 (list->char-set nix-base32-chars))
- ,(char-set #\-))))
-
- ;; We cannot use `regexp-exec' here because it cannot deal with strings
- ;; containing NUL characters, hence 'fold-port-matches'.
- (with-fluids ((%default-port-encoding #f))
- (when (file-port? input)
- (setvbuf input _IOFBF 65536))
- (when (file-port? output)
- (setvbuf output _IOFBF 65536))
-
- (let* ((len (+ 34 (string-length store)))
- (mapping (map (match-lambda
- ((origin . replacement)
- (unless (string=? (string-drop origin len)
- (string-drop replacement len))
- (error "invalid replacement" origin replacement))
- (cons (string-take origin len)
- (string-take replacement len))))
- mapping)))
- (fold-port-matches (lambda (string result)
- (match (assoc-ref mapping string)
- (#f
- (put-bytevector output (string->utf8 string)))
- ((= string->utf8 replacement)
- (put-bytevector output replacement)))
- #t)
- #f
- pattern
- input
- (lambda (char result) ;unmatched
- (put-u8 output (char->integer char))
- result)))))
+ "Read data from INPUT, replacing store references according to
+REPLACEMENT-TABLE, and writing the result to OUTPUT. REPLACEMENT-TABLE is a
+vhash that maps strings (original hashes) to bytevectors (replacement hashes).
+Note: We use string keys to work around the fact that guile-2.0 hashes all
+bytevectors to the same value."
+
+ (define (lookup-replacement s)
+ (match (vhash-assoc s replacement-table)
+ ((origin . replacement)
+ replacement)
+ (#f #f)))
+
+ (define (optimize-u8-predicate pred)
+ (cute vector-ref
+ (list->vector (map pred (iota 256)))
+ <>))
+
+ (define nix-base32-byte?
+ (optimize-u8-predicate
+ (compose nix-base32-char?
+ integer->char)))
+
+ (define (dash? byte) (= byte 45))
+
+ (define request-size (expt 2 20)) ; 1 MiB
+
+ ;; We scan the file for the following 33-byte pattern: 32 bytes of
+ ;; nix-base32 characters followed by a dash. To accommodate large files,
+ ;; we do not read the entire file, but instead work on buffers of up to
+ ;; 'request-size' bytes. To ensure that every 33-byte sequence appears
+ ;; entirely within exactly one buffer, adjacent buffers must overlap,
+ ;; i.e. they must share 32 byte positions. We accomplish this by
+ ;; "ungetting" the last 32 bytes of each buffer before reading the next
+ ;; buffer, unless we know that we've reached the end-of-file.
+ (let ((buffer (make-bytevector request-size)))
+ (let loop ()
+ ;; Note: We avoid 'get-bytevector-n' to work around
+ ;; <http://bugs.gnu.org/17466>.
+ (match (get-bytevector-n! input buffer 0 request-size)
+ ((? eof-object?) 'done)
+ (end
+ ;; We scan the buffer for dashes that might be preceded by a
+ ;; nix-base32 hash. The key optimization here is that whenever we
+ ;; find a NON-nix-base32 character at position 'i', we know that it
+ ;; cannot be part of a hash, so the earliest position where the next
+ ;; hash could start is i+1 with the following dash at position i+33.
+ ;;
+ ;; Since nix-base32 characters comprise only 1/8 of the 256 possible
+ ;; byte values, and exclude some of the most common letters in
+ ;; English text (e t o u), in practice we can advance by 33 positions
+ ;; most of the time.
+ (let scan-from ((i hash-length) (written 0))
+ ;; 'i' is the first position where we look for a dash. 'written'
+ ;; is the number of bytes in the buffer that have already been
+ ;; written.
+ (if (< i end)
+ (let ((byte (bytevector-u8-ref buffer i)))
+ (cond ((and (dash? byte)
+ ;; We've found a dash. Note that we do not know
+ ;; whether the preceeding 32 bytes are nix-base32
+ ;; characters, but we do not need to know. If
+ ;; they are not, the following lookup will fail.
+ (lookup-replacement
+ (string-tabulate (lambda (j)
+ (integer->char
+ (bytevector-u8-ref buffer
+ (+ j (- i hash-length)))))
+ hash-length)))
+ => (lambda (replacement)
+ ;; We've found a hash that needs to be replaced.
+ ;; First, write out all bytes preceding the hash
+ ;; that have not yet been written.
+ (put-bytevector output buffer written
+ (- i hash-length written))
+ ;; Now write the replacement hash.
+ (put-bytevector output replacement)
+ ;; Since the byte at position 'i' is a dash,
+ ;; which is not a nix-base32 char, the earliest
+ ;; position where the next hash might start is
+ ;; i+1, and the earliest position where the
+ ;; following dash might start is (+ i 1
+ ;; hash-length). Also, we have now written up to
+ ;; position 'i' in the buffer.
+ (scan-from (+ i 1 hash-length) i)))
+ ;; If the byte at position 'i' is a nix-base32 char,
+ ;; then the dash we're looking for might be as early as
+ ;; the following byte, so we can only advance by 1.
+ ((nix-base32-byte? byte)
+ (scan-from (+ i 1) written))
+ ;; If the byte at position 'i' is NOT a nix-base32
+ ;; char, then the earliest position where the next hash
+ ;; might start is i+1, with the following dash at
+ ;; position (+ i 1 hash-length).
+ (else
+ (scan-from (+ i 1 hash-length) written))))
+
+ ;; We have finished scanning the buffer. Now we determine how
+ ;; many bytes have not yet been written, and how many bytes to
+ ;; "unget". If 'end' is less than 'request-size' then we read
+ ;; less than we asked for, which indicates that we are at EOF,
+ ;; so we needn't unget anything. Otherwise, we unget up to
+ ;; 'hash-length' bytes (32 bytes). However, we must be careful
+ ;; not to unget bytes that have already been written, because
+ ;; that would cause them to be written again from the next
+ ;; buffer. In practice, this case occurs when a replacement is
+ ;; made near the end of the buffer.
+ (let* ((unwritten (- end written))
+ (unget-size (if (= end request-size)
+ (min hash-length unwritten)
+ 0))
+ (write-size (- unwritten unget-size)))
+ (put-bytevector output buffer written write-size)
+ (unget-bytevector input buffer (+ written write-size)
+ unget-size)
+ (loop)))))))))
(define (rename-matching-files directory mapping)
"Apply MAPPING to the names of all the files in DIRECTORY, where MAPPING is
@@ -122,6 +211,35 @@ an exception is caught."
#:optional (store (%store-directory)))
"Copy DIRECTORY to OUTPUT, replacing strings according to MAPPING, a list of
file name pairs."
+
+ (define hash-mapping
+ (let* ((prefix (string-append store "/"))
+ (start (string-length prefix))
+ (end (+ start hash-length)))
+ (define (valid-hash? h)
+ (every nix-base32-char? (string->list h)))
+ (define (valid-suffix? s)
+ (string-prefix? "-" s))
+ (define (hash+suffix s)
+ (and (< end (string-length s))
+ (let ((hash (substring s start end))
+ (suffix (substring s end)))
+ (and (string-prefix? prefix s)
+ (valid-hash? hash)
+ (valid-suffix? suffix)
+ (list hash suffix)))))
+ (map (match-lambda
+ (((= hash+suffix (origin-hash suffix))
+ .
+ (= hash+suffix (replacement-hash suffix)))
+ (cons origin-hash (string->utf8 replacement-hash)))
+ ((origin . replacement)
+ (error "invalid replacement" origin replacement)))
+ mapping)))
+
+ (define replacement-table
+ (alist->vhash hash-mapping))
+
(define prefix-len
(string-length directory))
@@ -138,18 +256,17 @@ file name pairs."
(symlink (call-with-output-string
(lambda (output)
(replace-store-references (open-input-string target)
- output mapping
+ output replacement-table
store)))
dest)))
((regular)
- (with-fluids ((%default-port-encoding #f))
- (call-with-input-file file
- (lambda (input)
- (call-with-output-file dest
- (lambda (output)
- (replace-store-references input output mapping
- store)
- (chmod output (stat:perms stat))))))))
+ (call-with-input-file file
+ (lambda (input)
+ (call-with-output-file dest
+ (lambda (output)
+ (replace-store-references input output replacement-table
+ store)
+ (chmod output (stat:perms stat)))))))
((directory)
(mkdir-p dest))
(else