-
Notifications
You must be signed in to change notification settings - Fork 53
/
string-search-impl.rkt
191 lines (177 loc) · 7.48 KB
/
string-search-impl.rkt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#lang racket/base
;; This file is private to string-search.rkt.
;; Applications should include string-search.rkt instead.
(provide
string/searchable
suffix:corpus-find*/disk
bytes->suffix-key
suffix-key->bytes
param-fd-input-binary
ensure-fd-input-binary
test:parameterize-defaults
;; The following are internals used in string-search.rkt
suffix<?/corpus2
concept-name
write-scm
write-suffix-keys
)
(require
racket/list
(except-in racket/match ==)
racket/string
racket/unsafe/ops
racket/vector
racket/dict
"base.rkt"
)
;; adapted from medikanren/repr.rkt
(define (concept-cui c) (list-ref c 0))
(define (concept-name c) (list-ref c 1))
;; BEGIN: excerpted from medikanren/repr.rkt
(define (byte-at offset n) (bitwise-and 255 (arithmetic-shift n offset)))
;; ...
(define string-key-byte-size 6)
(define suffix-key-byte-size (+ 6 2))
(define (suffix-key-count bs) (/ (bytes-length bs) suffix-key-byte-size))
(define (suffix-key-count/port in)
(file-position in eof)
(/ (file-position in) suffix-key-byte-size))
(define (suffix-key-ref index i) (bytes->suffix-key index i))
(define (suffix-key->bytes s)
(define cid (car s))
(define pos (cdr s))
(bytes (byte-at -40 cid) (byte-at -32 cid)
(byte-at -24 cid) (byte-at -16 cid) (byte-at -8 cid) (byte-at 0 cid)
(byte-at -8 pos) (byte-at 0 pos)))
(define (bytes->suffix-key bs start)
(define i (* start suffix-key-byte-size))
(define (bref-to j offset) (arithmetic-shift (bytes-ref bs (+ i j)) offset))
(cons (+ (bref-to 0 40) (bref-to 1 32) (bref-to 2 24) (bref-to 3 16) (bref-to 4 8) (bref-to 5 0))
(+ (bref-to 6 8) (bref-to 7 0))))
(define (read-suffix-key-bytes in) (read-bytes suffix-key-byte-size in))
(define (write-suffix-keys out v)
(for ((s (in-vector v))) (write-bytes (suffix-key->bytes s) out)))
(define (suffix-index->suffix-key in si)
(file-position in (* suffix-key-byte-size si))
(bytes->suffix-key (read-suffix-key-bytes in) 0))
;; ...
(define (write-scm out scm) (fprintf out "~s\n" scm))
;; END:excerpted from medikanren/repr.rkt
(define (nlist-intersection nlists)
(if (null? nlists) '()
(let loop ((i** nlists))
(cond ((ormap null? i**) '())
(else (define i0* (map car i**))
(define next (apply max i0*))
(if (andmap (lambda (i) (= i next)) i0*)
(cons next (loop (map cdr i**)))
(loop (map (lambda (i*) (dropf i* (lambda (i) (< i next))))
i**))))))))
(define i0 (char->integer #\0))
(define i9 (char->integer #\9))
(define iA (char->integer #\A))
(define iZ (char->integer #\Z))
(define (searchable? c) (and (<= i0 c) (<= c iZ) (or (<= c i9) (<= iA c))))
(define (string/searchable s)
(define cs (map char->integer (string->list (string-upcase s))))
(list->string (map integer->char (filter searchable? cs))))
(define (string<?/suffixes a ai b bi)
(define alen (- (string-length a) ai))
(define blen (- (string-length b) bi))
(let loop ((k (min alen blen)) (ai ai) (bi bi))
(cond ((= k 0) (< alen blen))
((char<? (string-ref a ai) (string-ref b bi)) #t)
((char>? (string-ref a ai) (string-ref b bi)) #f)
(else (loop (- k 1) (+ ai 1) (+ bi 1))))))
(define (suffix<?/corpus2 hashcorpus bin-a bin-b)
(let* ((a (bytes->suffix-key bin-a 0))
(b (bytes->suffix-key bin-b 0))
(c (hash-ref hashcorpus (car a)))
(d (hash-ref hashcorpus (car b))))
(string<?/suffixes c (cdr a) d (cdr b))))
(define (remove-adjacent-duplicates xs)
(define (remove/x x xs)
(cons x (let loop ((xs xs))
(cond ((null? xs) '())
((equal? (car xs) x) (loop (cdr xs)))
(else (remove/x (car xs) (cdr xs)))))))
(if (null? xs) '() (remove/x (car xs) (cdr xs))))
(define (dedup/< ns) (remove-adjacent-duplicates (sort ns <)))
(define (suffix:corpus-find-range/disk cid->concept in-index str)
(define needle (string/searchable str))
(define (compare si needle)
(match-define (cons cid pos) (suffix-index->suffix-key in-index si))
(define hay (substring (string/searchable (concept-name (cid->concept cid))) pos))
(cond ((string-prefix? hay needle) 0)
((string<? hay needle) -1)
(else 1)))
;; Find a point in the desired range...
(let find-range ((start 0) (end (suffix-key-count/port in-index)))
(cond ((< start end)
(define mid (+ start (quotient (- end start) 2)))
(case (compare mid needle)
((-1) (find-range (+ 1 mid) end))
(( 1) (find-range start mid))
(( 0) ;; ... then find the start and end of that range.
(define rstart
(let loop ((start start) (end mid))
(cond ((< start end)
(define mid (+ start (quotient (- end start) 2)))
(case (compare mid needle)
((-1) (loop (+ 1 mid) end))
(( 0) (loop start mid))
(else (error "rstart: this shouldn't happen."))))
(else end))))
(define rend
(let loop ((start (+ 1 mid)) (end end))
(cond ((< start end)
(define mid (+ start (quotient (- end start) 2)))
(case (compare mid needle)
((1) (loop start mid))
((0) (loop (+ 1 mid) end))
(else (error "rend: this shouldn't happen."))))
(else end))))
(cons rstart rend))))
(else (cons start end)))))
(define (suffix:corpus-find*/disk cid->concept in-index str*)
(define (rz r) (- (cdr r) (car r)))
(define rs (map (lambda (s) (suffix:corpus-find-range/disk cid->concept in-index s)) str*))
(define zmin (* 2 (if (null? rs) 0 (foldl (lambda (r z) (min z (rz r)))
(rz (car rs)) (cdr rs)))))
(nlist-intersection
(map (lambda (r) (dedup/< (map (lambda (i) (car (suffix-index->suffix-key in-index i)))
(range (car r) (cdr r)))))
(filter (lambda (r) (<= (rz r) zmin)) rs))))
;;;; Provisions for efficient safe use of file handles
;;; param-fd-input-binary:
;;; A context for caching file descriptors.
;;; To preserve context, (make-hash), keep a reference, and pass
;;; via parameterize.
(define param-fd-input-binary
(make-parameter
'initialize-me-via-make-hash
#f))
;;; ensure-fd-input-binary:
;;; Provide the cached file handle for fn, or open a file handle
;;; if one hasn't been created.
(define (ensure-fd-input-binary absf)
(define (ensure-fd h)
(dict-ref! h absf (lambda () (open-input-file absf #:mode 'binary))))
(define h (param-fd-input-binary))
(if (dict? h)
(ensure-fd h)
(let* (
(h (make-hash))
(fd (ensure-fd h)))
(param-fd-input-binary h)
fd)))
;;; test:parameterize-defaults
;;; A helper for automated tests to be able to open string indexes.
;;; Applications should implement a similar but usually more complicated
;;; function. Call (make-hash) as we do here to trigger allocation of a
;;; new hash of file handles, or supply a previously allocated hash
;;; of file handles for reuse.
(define (test:parameterize-defaults thunk)
(parameterize
((param-fd-input-binary (make-hash)))
(thunk)))