From c26dfad78d35793047b11ac435b1f8e40e072f94 Mon Sep 17 00:00:00 2001 From: Michael Raitza Date: Sun, 31 Jan 2021 22:56:09 +0100 Subject: [PATCH] Fix byte sequence difference algorithm --- trees/cb/cb-docs.factor | 34 +++++++++++++++---------------- trees/cb/cb-test.factor | 24 ---------------------- trees/cb/cb-tests.factor | 17 ++++++++++++++++ trees/cb/cb.factor | 44 ++++++++++++++++++---------------------- trees/cb/summary.txt | 3 ++- 5 files changed, 56 insertions(+), 66 deletions(-) delete mode 100644 trees/cb/cb-test.factor create mode 100644 trees/cb/cb-tests.factor diff --git a/trees/cb/cb-docs.factor b/trees/cb/cb-docs.factor index 1b18bea..ff6e702 100644 --- a/trees/cb/cb-docs.factor +++ b/trees/cb/cb-docs.factor @@ -2,6 +2,22 @@ USING: arrays assocs byte-arrays help.markup help.syntax io.encodings.utf8 kernel math serialize trees.cb.private ; IN: trees.cb +ARTICLE: "trees.cb" "Binary crit-bit trees" +"The " { $vocab-link "trees.cb" } " vocabulary is a library for binary critical bit trees, a variant of PATRICIA tries. A crit-bit tree stores each element of a non-empty set of keys " { $snippet "K" } " in a leaf node. Each leaf node is attached to the tree of internal split nodes for bit strings " { $snippet "x" } " such that " { $snippet "x0" } " and " { $snippet "x1" } " are prefixes of (serialized byte arrays of) elements in " { $snippet "K" } " and ancestors of other bit strings higher up in the tree. Split nodes store the prefix compressed as two values, the byte number and bit position, in the subset of " { $snippet "K" } " at which the prefixes of all ancestors to the left differ from all ancestors to the right." +$nl +"Serialization of keys is implemented using " { $link key>bytes } ". Crit-bit trees can store arbitrary keys and values, even mixed (but see implementation notes to " { $link key>bytes* } "). 
Due to the nature of crit-bit trees, for any given input key set that shares a common prefix, the tree compresses the common prefix into the split node at the joint, extending the lookup by one node for arbitrarily long prefixes." +$nl +"Keys are serialized once for every lookup and insertion not adding a new leaf node. Two keys are serialized for every insertion adding a new leaf node to the tree." +$nl +"Due to ordering ancestors at split nodes into crit-bit '0' (left) and crit-bit '1' (right), the order of the elements in a crit-bit tree is total, allowing efficient suffix searches and minimum searches." +$nl +"Crit-bit trees consume 2 * " { $emphasis "n" } " - 1 nodes in total for storing " { $emphasis "n" } " elements; each internal split node consumes two pointers, a fixnum and an integer; each leaf node consumes two pointers, one to the key and one to the value. Their shape is unique for any given set of keys, which also means lookup times are deterministic for a known set of keys regardless of insertion order or the tree having been cloned." +$nl +"Compared to hash tables, crit-bit trees provide fast access without being prone to malicious input (but see limitations of the standard implementation of " { $link key>bytes* } ") and also provide ordered operations (e.g. finding minimums). Compared to heaps, they support exact searches and suffix searches in addition. Compared to other ordered trees (AVL trees, B-trees), they support the same set of operations while keeping a simpler inner structure." +$nl +"Crit-bit trees conform to the assoc protocol." +; + HELP: CB{ { $syntax "CB{ { key value }... }" } { $values { "key" "a key" } { "value" "a value" } } @@ -22,20 +38,4 @@ HELP: key>bytes* { $values { "key" object } { "bytes" byte-array } } { $description "Converts a key, which can be any " { $link object } ", into a " { $link byte-array } ". 
Standard methods convert strings into its " { $link utf8 } " byte sequences and " { $link float } " values into byte arrays representing machine-specific doubles. Integrals are converted into a byte sequence of at least machine word size in little endian byte order." $nl -"All other objects are serialized using " { $link object>bytes } ". In the standard implementation, this maps " { $link f } " to the byte array " { $snippet "B{ 110 }" } " and " { $link t } " to " { $snippet "B{ 116 }" } ", which is identical to the respective integers." } ; - -ARTICLE: "trees.cb" "Binary crit-bit trees" -"The " { $vocab-link "trees.cb" } " vocabulary is a library for binary critical bit trees, a variant of PATRICIA tries. A crit-bit tree stores each element of a non-empty set of keys " { $snippet "K" } " in a leaf node. Each leaf node is attached to the tree of internal split nodes for bit strings " { $snippet "x" } " such that " { $snippet "x0" } " and " { $snippet "x1" } " are prefixes of (serialized byte arrays of) elements in " { $snippet "K" } " and ancestors of other bit strings higher up in the tree. Split nodes store the prefix compressed as two values, the byte number and bit position, in the subset of " { $snippet "K" } " at which the prefixes of all ancestors to the left differ from all ancestors to the right." -$nl -"Serialization of keys is implemented using " { $link key>bytes } ". Crit-bit trees can store arbitrary keys and values, even mixed (but see implementation notes to " { $link key>bytes* } "). Due to the nature of crit-bit trees, for any given input key set that shares a common prefix, the tree compresses the common prefix into the split node at the root extending the lookup by one for arbitrary long prefixes." -$nl -"Keys are serialized once for every lookup and insertion not adding a new leaf node. Two keys are serialized for every insertion adding a new leaf node to the tree." 
-$nl -"Due to ordering ancestors at split nodes into crit-bit '0' (left) and crit-bit '1' (right), the order of the elements in a crit-bit tree is total allowing efficient suffix searches and minimum searches." -$nl -"Crit-bit trees consume 2 * " { $emphasis "n" } " - 1 nodes in total for storing " { $emphasis "n" } " elements; each internal split node consumes two pointers and two fixnums; each leaf node two pointers to the key and value. Their shape is unique for any given set of keys, which also means lookup times are deterministic for a known set of keys regardless of insertion order or the tree having been cloned." -$nl -"Compared to hash tables, crit-bit trees provide fast access without being prone to malicious input (but see limitations of the standard implementation of " { $link key>bytes* } ") and also provide ordered operations (e.g. finding minimums). Compared to heaps, they support exact searches and suffix searches in addition. Compared to other ordered trees (AVL, B-), they support the same set of operations while keeping a simpler inner structure." -$nl -"Crit-bit trees conform to the assoc protocol." -; +"All other objects are serialized using " { $link object>bytes } ". In the standard implementation, this maps " { $link f } " to the byte array " { $snippet "B{ 110 }" } " and " { $link t } " to " { $snippet "B{ 116 }" } ", which is identical to using the respective literal byte arrays as inputs." } ; diff --git a/trees/cb/cb-test.factor b/trees/cb/cb-test.factor deleted file mode 100644 index e584c9d..0000000 --- a/trees/cb/cb-test.factor +++ /dev/null @@ -1,24 +0,0 @@ -USING: assocs kernel tools.test trees trees.cb trees.private ; -IN: trees.cb.tests - -! Insertion into empty tree -{ T{ cb { root T{ node { key 0 } { value 0 } } } { count 1 } } } [ - 0 0 [ set-at ] keep -] unit-test - -! 
Insertion into a leaf-node resulting in splitting -{ - T{ cb - { root - T{ cb-node - { bits 247 } - { left T{ node { key 1 } { value 1 } } } - { right T{ node { key 0 } { value 0 } } } - } - } - { count 2 } - } -} [ - 0 0 [ set-at ] keep - 1 1 rot [ set-at ] keep -] unit-test diff --git a/trees/cb/cb-tests.factor b/trees/cb/cb-tests.factor new file mode 100644 index 0000000..8b3ae46 --- /dev/null +++ b/trees/cb/cb-tests.factor @@ -0,0 +1,17 @@ +USING: assocs kernel tools.test trees trees.cb trees.cb.private trees.private ; +IN: trees.cb.tests + +CONSTANT: 4tree CB{ { 0 0 } { 1 1 } { 2 2 } { 3 3 } } + +! Insertion into an empty tree +{ CB{ { 0 0 } } } [ + 0 0 [ set-at ] keep +] unit-test + +! Insertion into a leaf-node resulting in splitting +{ + CB{ { 0 0 } { 1 1 } } +} [ + 0 0 [ set-at ] keep + 1 1 rot [ set-at ] keep +] unit-test diff --git a/trees/cb/cb.factor b/trees/cb/cb.factor index 2147d92..0e724f5 100644 --- a/trees/cb/cb.factor +++ b/trees/cb/cb.factor @@ -16,9 +16,9 @@ USING: accessors alien arrays assocs byte-arrays combinators combinators.short-circuit fry io.binary io.encodings.binary io.encodings.private -io.encodings.string io.encodings.utf8 kernel layouts locals make math -math.private namespaces parser prettyprint.custom sequences serialize strings -trees trees.private vectors ; +io.encodings.string io.encodings.utf8 kernel layouts locals make math math.order +math.private namespaces parser prettyprint.custom sequences sequences.private +serialize strings trees trees.private vectors ; IN: trees.cb TUPLE: cb < tree ; @@ -27,7 +27,7 @@ TUPLE: cb < tree ; ] [ 2nip [ swap nth ] keep ] } - { [ 2dup < ] [ drop [ drop ] 2dip [ swap nth ] keep ] } - [ 4drop 0 f ] - } cond ; +: nth0 ( n seq -- elt/0 ) + ?nth [ 0 ] unless* ; -: order-by-length ( seq1 seq2 -- seq-short seq-long ) - 2dup [ length ] bi@ > [ swap ] when ; +: 2nth0 ( n seq1 seq2 -- elt1/0 elt2/0 ) + [ nth0 ] bi-curry@ bi ; ! 
For two byte strings, calculate the critical bit, byte and direction of -! difference. -: (bytes-diff) ( newbytes oldbytes -- side bits byte# ) +! difference. For meaningful results ensure that newbytes ≠ oldbytes +: bytes-diff ( newbytes oldbytes -- side bits byte# ) 2dup mismatch [ - [ '[ _ swap nth ] bi@ byte-diff ] keep + [ -rot 2nth-unsafe byte-diff ] keep ] [ - ! Equal prefix over full (shorter) byte sequence. - elt-from-long-seq [ [ 0 ] dip ] [ ] if* ; - [ 1 255 ] 2dip shorter length 1 - + ! [ [ length ] bi@ = ] 2keep rot + ! [ 2drop 0 0 f ] + ! [ + [ min-length dup ] 2keep + 2nth0 byte-diff rot + ! ] if ] if* ; -: bytes-diff ( newbytes oldbytes -- side bits byte#/f ) - bytes-diff ; - PRIVATE> GENERIC: key>bytes* ( key -- bytes ) @@ -116,7 +112,7 @@ SYMBOL: new-side ! Extract the critical byte : byte-at ( byte# -- byte/0 ) - key-bytes get ?nth [ 0 ] unless* ; + key-bytes get nth0 ; ! For the current key and cb-node determin which side to go next : select-side ( node -- node side ) @@ -159,7 +155,7 @@ M: f cb-update ! or create a new split node and attach a fresh leaf node with the new key and ! value. M: node cb-update - dup key>> current-key get = [ + dup key>> key>bytes key-bytes get = [ current-key get >>key swap >>value f ] [ @@ -303,7 +299,7 @@ SYNTAX: CB{ M: cb assoc-like drop dup cb? [ >cb ] unless ; M: cb pprint-delims drop \ CB{ \ } ; -M: cb >pprint-sequence >alist ; +M: cb >pprint-sequence >cb-alist ; M: cb pprint-narrow? drop t ; PRIVATE> diff --git a/trees/cb/summary.txt b/trees/cb/summary.txt index 827a94e..1ba030c 100644 --- a/trees/cb/summary.txt +++ b/trees/cb/summary.txt @@ -1 +1,2 @@ -Critical bit trees as described in http://cr.yp.to/critbit.html +Critical bit trees as described in http://cr.yp.to/critbit.html. +They are implemented as subclasses of trees.