1
0
Fork 0
factor-puzzles/rosalind/lcsm/lcsm.factor

54 lines
1.7 KiB
Factor

! Copyright (C) 2020 Bubbler.
! See http://factorcode.org/license.txt for BSD license.
USING: arrays assocs combinators fry io io.encodings.utf8
io.files kernel math rosalind.common sequences sequences.extras
;
IN: rosalind.lcsm
: dna>enum ( string -- array ) >array "$ACGT" { } zip-index-as substitute 0 suffix ;
! deeply nested array where every node is a fixed-length array
! branches with zero entries are f
: trie-init ( -- trie ) 5 f <array> ;
! in-place operation
: trie-add ( trie string -- )
[ ! ( trie char -- trie )
[ swap nth ]
[ ! ( nth/f trie char -- trie )
rot [ 2nip ] [ trie-init [ spin set-nth ] [ ] bi ] if*
] 2bi
] each drop ;
: trie-longest-prefix ( trie string -- string )
[ ! ( trie char -- trie'/f trie'/f )
swap nth dup
] take-while nip ;
: suffix-trie ( string -- trie )
dup '[ nip _ swap tail-slice ] map-index
trie-init [ [ trie-add ] keepd ] reduce ;
: trie-intersect-string ( trie string -- trie )
dup '[ nip _ swap tail-slice ] map-index
trie-init [ ! ( prevtrie trie string -- prevtrie trie )
overd trie-longest-prefix [ zero? ] trim-tail 0 suffix [ trie-add ] keepd
] reduce nip ;
: trie-strings ( trie -- seq )
{ { f [ { } ] }
{ { f f f f f } [ { { } } ] }
[ [ [ trie-strings ] dip '[ _ prefix ] map ] map-index concat ] } case ;
: enum>dna ( array -- string ) [ zero? ] trim-tail "$ACGT" nths ;
: intersect-dnas ( dnas -- intersections )
[ dna>enum ] map unclip suffix-trie [ trie-intersect-string ] reduce
trie-strings [ enum>dna ] map ;
: lcsm ( dnas -- dna ) intersect-dnas longest ;
: lcsm-main ( -- ) "datasets/rosalind/lcsm.txt" utf8 file-contents fasta>assoc values lcsm print ;
MAIN: lcsm-main