(define (make-extended-ngram-table winners losers) (let* ((initial-ngrams (generate-ngrams winners losers))) (write-string "Initial ngrams: ") (write (length initial-ngrams)) (newline) (map (lambda (winner) (cons winner (keep-matching-items initial-ngrams (lambda (ngram) (re-string-search-forward ngram winner))))) winners))) (define (generate-ngrams winners losers) (write-string "Generating ngrams...")(newline) (let ((losing-ngram? (string-list-matcher losers))) (fold-left (lambda (answer winner) (lset-union equal? answer (extended-ngrams losing-ngram? winner))) '() winners))) (define (string-list-matcher string-list) (lambda (test-ngram) (there-exists? string-list (lambda (string) (re-string-search-forward test-ngram string))))) (define *dotification-limit* 4) (define *generate-ends-of-words* #t) (define *generate-dotted* #t) (define (ngrams-of-length n string) (do ((start 0 (1+ start)) (end n (1+ end)) (answer '() (lset-adjoin string=? answer (substring string start end)))) ((> end (string-length string)) answer))) (define (generate-dotted answer losing-ngram?) (do ((tail answer (cdr tail)) (answer '() (let ((item (car tail))) (fold-left (lambda (answer dotted) (if (losing-ngram? dotted) answer (lset-adjoin string=? answer dotted))) answer (dotify item))))) ((not (pair? tail)) (if (null? tail) answer (improper-list-error 'generate-dotted tail))))) (define (dotify word) (cond ((string=? word "") (list "")) ((> (string-length word) *dotification-limit*) (list word)) (else (fold-left (lambda (answer dotified) (fold-left (lambda (answer replacement) (lset-adjoin equal? answer (string-append replacement dotified))) answer (replacements (substring word 0 1)))) '() (dotify (substring word 1 (string-length word))))))) (define (replacements string) (if (or (string=? string "^") (string=? string "$")) (list string) (list string "."))) (define (extended-ngrams losing-ngram? string) (let ((string (if *generate-ends-of-words* (string-append "^" string "$") string))) (do ((n 1 (+ n 1)) (answer '() (lset-union string=? answer (delete-matching-items (ngrams-of-length n string) losing-ngram?)))) ((> n (string-length string)) (if *generate-dotted* (generate-dotted answer losing-ngram?) answer)))))Adding the dotification greatly increases the number of ways to match words:
1 ]=> (extended-ngrams (string-list-matcher losers) "lincoln") ;Value 15: ("li" "ln" "ln$" "oln" ".ln" "col" "lin" "li." "^li" "o.n$" "oln$" ".ln$" "col." "c.ln" "..ln" "coln" ".oln" "co.n" "n.ol" "..ol" "ncol" ".col" "nc.l" "i.co" "inco" "i..o" "in.o" "lin." "li.." "l.nc" "linc" "l..c" "li.c" "^li." "^lin" "coln$" "ncoln" "incol" "linco" "^linc" "ncoln$" "incoln" "lincol" "^linco" "incoln$" "lincoln" "^lincol" "lincoln$" "^lincoln" "^lincoln$")The table that maps words to their extended ngrams is quite large, but it can be reduced in size without affecting the solution to the set cover problem. If two regexps match exactly the same set of winning strings, then one can be substituted for the other in any solution, so we can discard all but the shortest of these. If a regexp matches a proper superset of another regexp, and the other regexp is at least the same length or longer, then the first regexp dominates the second one, so we can discard the second one.
(define (minimize-keys value->keys-table better-solution) (let* ((all-keys (get-keys value->keys-table)) (equivalents (collect-equivalent-partial-solutions value->keys-table (map list all-keys))) (reduced (map (lambda (equivalent) (cons (car equivalent) (car (least-elements (cdr equivalent) better-solution)))) equivalents)) (dominants (collect-dominant-partial-solutions reduced better-solution)) (good-keys (fold-left (lambda (answer candidate) (lset-adjoin equal? answer (cadr candidate))) '() dominants))) (define (rebuild-entry entry) (cons (car entry) (keep-matching-items (cdr entry) (lambda (item) (member item good-keys))))) (write-string "Deleting ") (write (- (length all-keys) (length good-keys))) (write-string " of ") (write (length all-keys)) (write-string " keys. ") (write (length good-keys)) (write-string " keys remain.")(newline) (map rebuild-entry value->keys-table))) (define (partial-solution-matches value->keys-table partial-solution) (keep-matching-items value->keys-table (lambda (entry) (there-exists? partial-solution (lambda (key) (member key (cdr entry))))))) (define (collect-equivalent-partial-solutions value->keys-table partial-solutions) (let ((answer-table (make-equal-hash-table))) (for-each (lambda (partial-solution) (hash-table/modify! answer-table (map car (partial-solution-matches value->keys-table partial-solution)) (list) (lambda (other) (lset-adjoin equal? other partial-solution)))) partial-solutions) (hash-table->alist answer-table))) (define (collect-dominant-partial-solutions equivalents better-solution) (define (dominates? left right) (and (superset? (car left) (car right)) (not (better-solution (cdr right) (cdr left))))) (let ((sorted (sort equivalents (lambda (l r) (> (length (car l)) (length (car r))))))) (fold-left (lambda (answer candidate) (if (there-exists? answer (lambda (a) (dominates? a candidate))) answer (lset-adjoin equal? answer candidate))) '() sorted)))We can minimize the value->key-table in another way. If two values in the table are matched by the exact same set of keys, then we can delete one without changing the solution. If a value is matched by a small set of keys, and if another values is matched by a superset of these keys, then we can delete the larger one because if the smaller one matches, the larger one must match as well.
(define (minimize-values v-k-table) (let ((size-before (length v-k-table))) (define (dominated-value? entry) (let ((entry-value (car entry)) (entry-keylist (cdr entry))) (there-exists? v-k-table (lambda (other-entry) (and (not (eq? entry other-entry)) (let ((other-value (car other-entry)) (other-keylist (cdr other-entry))) (let ((result (and (superset? entry-keylist other-keylist) (not (superset? other-keylist entry-keylist))))) (if result (begin (display "Removing ") (write entry-value) (display " dominated by ") (write other-value) (display ".") (newline) )) result))))))) (define (equivalent-value-in-answer? answer entry) (let ((entry-value (car entry)) (entry-keylist (cdr entry))) (there-exists? answer (lambda (other-entry) (let ((other-value (car other-entry)) (other-keylist (cdr other-entry))) (let ((result (equal? entry-keylist other-keylist))) (if result (begin (display "Removing ") (write entry-value) (display " equivalent to ") (write other-value) (display ".") (newline) )) result)))))) (define (add-entry answer entry) (if (or (equivalent-value-in-answer? answer entry) (dominated-value? entry)) answer (cons entry answer))) (let ((answer (fold-left add-entry '() v-k-table))) (write-string "Removed ") (write (- size-before (length answer))) (write-string " dominated and equivalent values.") (newline) answer)))Each time we remove values or keys, we might make more keys and values equivalent or dominated, so we iterate until we can no longer remove anything.
(define (minimize-vktable value->keys-table better-solution) (let* ((before-size (fold-left + 0 (map length value->keys-table))) (new-table (minimize-values (minimize-keys value->keys-table better-solution))) (after-size (fold-left + 0 (map length new-table)))) (if (= before-size after-size) value->keys-table (minimize-vktable new-table better-solution))))The minimized table for the presidents looks like this:
(("washington" "sh" "g..n" "n..o" ".h.n" "a..i") ("adams" "a.a" "am" "ad") ("madison" "m..i" "i..n" "is." "i.o" "di" "ma" "ad") ("monroe" "r.e$" "oe") ("van-buren" "u..n" "r.n" ".b" "bu" "-") ("harrison" "r..s" "r.i" "i..n" "is." "i.o" "a..i") ("polk" "po") ("taylor" "ay." "ta") ("pierce" "ie." "rc" "r.e$") ("buchanan" "bu" "a.a" ".h.n") ("lincoln" "i..o" "li") ("grant" "an.$" "a.t" "ra" "r.n" "g..n") ("hayes" "h..e" "ye" "ay.") ("garfield" "el.$" "i.l" "ga" "ie." "r.i" ".f" "a..i") ("cleveland" "v.l" "an.$") ("mckinley" "n.e" "nl" "i.l" "m..i") ("roosevelt" ".se" "oo" "v.l" "el.$" "r..s") ("taft" "a.t" "ta" ".f") ("wilson" "ls" "i..o") ("harding" "r.i" "di" "a..i") ("coolidge" "oo" "li") ("hoover" "ho" "oo") ("truman" "u..n" "ma") ("eisenhower" "ho" ".se" "h..e" "i..n" "is.") ("kennedy" "nn" "n.e") ("johnson" "j") ("nixon" "^n" "i..n" "i.o" "n..o") ("carter" "rt" "a.t") ("reagan" "ga" "a.a") ("bush" "bu" "sh") ("obama" ".b" "ma" "a.a" "am"))As you can see, we have reduced the original 2091 matching regexps to fifty.
Changes to the set-cover code coming soon....
No comments:
Post a Comment