123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688
/*******************************************************************************

        copyright:      Copyright (c) 2009 Kris Bell. All rights reserved

        license:        BSD style: $(LICENSE)

        version:        May 2009: Initial release

        since:          0.99.9

        author:         Kris

*******************************************************************************/

module tango.text.Search;

private import Util = tango.text.Util;

/******************************************************************************

        Returns a lightweight pattern matcher, good for short patterns 
        and/or short to medium length content. Brute-force approach with
        fast multi-byte comparisons

******************************************************************************/

FindFruct!(T) find(T) (const(T)[] what)
{
        return FindFruct!(T) (what);
}

/******************************************************************************

        Returns a welterweight pattern matcher, good for long patterns 
        and/or extensive content. Based on the QS algorithm which is a
        Boyer-Moore variant. Does not allocate memory for the alphabet.

        Generally becomes faster as the match-length grows

******************************************************************************/

SearchFruct!(T) search(T) (const(T)[] what)
{
        return SearchFruct!(T) (what);
}

/******************************************************************************

        Convenient bundle of lightweight find utilities, without the
        hassle of IFTI problems. Create one of these using the find() 
        function:
        ---
        auto match = find ("foo");
        auto content = "wumpus foo bar"

        // search in the forward direction
        auto index = match.forward (content);
        assert (index is 7);

        // search again - returns length when no match found
        assert (match.forward(content, index+1) is content.length);
        ---

        Searching operates both forward and backward, with an optional
        start offset (can be more convenient than slicing the content).
        There are methods to replace matches within given content, and 
        others which return foreach() iterators for traversing content.

        SearchFruct is a more sophisticated variant, which operates more
        efficiently on longer matches and/or more extensive content.

******************************************************************************/

private struct FindFruct(T)
{       
        private const(T)[] what;

        /***********************************************************************

                Search forward in the given content, starting at the 
                optional index.

                Returns the index of a match, or content.length where
                no match was located.

        ***********************************************************************/

        size_t forward (const(T)[] content, size_t ofs = 0)
        {
                return Util.index (content, what, ofs);
        }

        /***********************************************************************

                Search backward in the given content, starting at the 
                optional index.

                Returns the index of a match, or content.length where
                no match was located.

        ***********************************************************************/

        size_t reverse (const(T)[] content, size_t ofs = size_t.max)
        {       
                return Util.rindex (content, what, ofs);
        }

        /***********************************************************************

                Return the match text

        ***********************************************************************/

        @property
        const(T)[] match ()
        {
                return what;
        }

        /***********************************************************************

                Reset the text to match

        ***********************************************************************/

        @property
        void match (const(T)[] what)
        {
                this.what = what;
        }

        /***********************************************************************

                Returns true if there is a match within the given content

        ***********************************************************************/

        bool within (const(T)[] content)
        {       
                return forward(content) != content.length;
        }

        /***********************************************************************
                
                Returns number of matches within the given content

        ***********************************************************************/

        size_t count (const(T)[] content)
        {       
                size_t mark, count;

                while ((mark = Util.index (content, what, mark)) != content.length)
                        ++count, ++mark;
                return count;
        }

        /***********************************************************************

                Replace all matches with the given character. Use method
                tokens() instead to avoid heap activity.

                Returns a copy of the content with replacements made

        ***********************************************************************/

        T[] replace (const(T)[] content, T chr)
        {     
                return replace (content, (&chr)[0..1]);  
        }

        /***********************************************************************

                Replace all matches with the given substitution. Use 
                method tokens() instead to avoid heap activity.

                Returns a copy of the content with replacements made

        ***********************************************************************/

        T[] replace (const(T)[] content, const(T)[] sub = null)
        {  
                T[] output;

                foreach (s; tokens (content, sub))
                         output ~= s;
                return output;
        }

        /***********************************************************************

                Returns a foreach() iterator which exposes text segments
                between all matches within the given content. Substitution
                text is also injected in place of each match, and null can
                be used to indicate removal instead:
                ---
                char[] result;

                auto match = find ("foo");
                foreach (token; match.tokens ("$foo&&foo*", "bar"))
                         result ~= token;
                assert (result == "$bar&&bar*");
                ---
                
                This mechanism avoids internal heap activity.                

        ***********************************************************************/

        Util.PatternFruct!(T) tokens (const(T)[] content, const(T)[] sub = null)
        {
                return Util.patterns (content, what, sub);
        }
        
        /***********************************************************************

                Returns a foreach() iterator which exposes the indices of
                all matches within the given content:
                ---
                int count;

                auto f = find ("foo");
                foreach (index; f.indices("$foo&&foo*"))
                         ++count;
                assert (count is 2);
                ---

        ***********************************************************************/

        Indices indices (const(T)[] content)
        {
                return Indices (what, content);
        }
 
        /***********************************************************************

                Simple foreach() iterator

        ***********************************************************************/

        private struct Indices
        {
                const(T)[]     what,
                               content;

                int opApply (scope int delegate (ref size_t index) dg)
                {
                        int    ret;
                        size_t mark;

                        while ((mark = Util.index(content, what, mark)) != content.length)                        
                                if ((ret = dg(mark)) is 0)                                
                                     ++mark;       
                                else
                                   break;                        
                        return ret;   
                }     
        } 
}


/******************************************************************************

        Convenient bundle of welterweight search utilities, without the
        hassle of IFTI problems. Create one of these using the search() 
        function:
        ---
        auto match = search ("foo");
        auto content = "wumpus foo bar"

        // search in the forward direction
        auto index = match.forward (content);
        assert (index is 7);

        // search again - returns length when no match found
        assert (match.forward(content, index+1) is content.length);
        ---

        Searching operates both forward and backward, with an optional
        start offset (can be more convenient than slicing the content).
        There are methods to replace matches within given content, and 
        others which return foreach() iterators for traversing content.

        FindFruct is a simpler variant, which can operate efficiently on 
        short matches and/or short content (employs brute-force strategy)

******************************************************************************/

private struct SearchFruct(T)
{
        private const(T)[]             what;
        private bool                   fore;
        private int[256]               offsets = void;

        /***********************************************************************

                Construct the fruct

        ***********************************************************************/

        static SearchFruct opCall (const(T)[] what) 
        {
                SearchFruct find = void;
                find.match = what;
                return find;
        }
        
        /***********************************************************************

                Return the match text

        ***********************************************************************/

        @property
        const(T)[] match ()
        {
                return what;
        }

        /***********************************************************************

                Reset the text to match

        ***********************************************************************/

        @property
        void match (const(T)[] what)
        {
                offsets[] = cast(int)(what.length + 1);
                this.fore = true;
                this.what = what;
                reset();
        }

        /***********************************************************************

                Search forward in the given content, starting at the 
                optional index.

                Returns the index of a match, or content.length where
                no match was located.

        ***********************************************************************/

        size_t forward (const(T)[] content, size_t ofs = 0) 
        {
                if (! fore)
                      flip();

                if (ofs > content.length)
                    ofs = content.length;

                return find (cast(char*) what.ptr, what.length * T.sizeof, 
                             cast(char*) content.ptr, content.length * T.sizeof, 
                             ofs * T.sizeof) / T.sizeof;
        }

        /***********************************************************************

                Search backward in the given content, starting at the 
                optional index.

                Returns the index of a match, or content.length where
                no match was located.

        ***********************************************************************/

        size_t reverse (const(T)[] content, size_t ofs = size_t.max) 
        {
                if (fore)
                    flip();

                if (ofs > content.length)
                    ofs = content.length;

                return rfind (cast(char*) what.ptr, what.length * T.sizeof, 
                              cast(char*) content.ptr, content.length * T.sizeof, 
                              ofs * T.sizeof) / T.sizeof;
        }

        /***********************************************************************

                Returns true if there is a match within the given content

        ***********************************************************************/

        bool within (const(T)[] content)
        {       
                return forward(content) != content.length;
        }

        /***********************************************************************
                
                Returns number of matches within the given content

        ***********************************************************************/

        size_t count (const(T)[] content)
        {       
                size_t mark, count;

                while ((mark = forward (content, mark)) != content.length)
                        ++count, ++mark;
                return count;
        }

        /***********************************************************************

                Replace all matches with the given character. Use method
                tokens() instead to avoid heap activity.

                Returns a copy of the content with replacements made

        ***********************************************************************/

        T[] replace (const(T)[] content, T chr)
        {     
                return replace (content, (&chr)[0..1]);  
        }

        /***********************************************************************

                Replace all matches with the given substitution. Use 
                method tokens() instead to avoid heap activity.

                Returns a copy of the content with replacements made

        ***********************************************************************/

        T[] replace (const(T)[] content, const(T)[] sub = null)
        {  
                T[] output;

                foreach (s; tokens (content, sub))
                         output ~= s;
                return output;
        }

        /***********************************************************************

                Returns a foreach() iterator which exposes text segments
                between all matches within the given content. Substitution
                text is also injected in place of each match, and null can
                be used to indicate removal instead:
                ---
                char[] result;

                auto match = search ("foo");
                foreach (token; match.tokens("$foo&&foo*", "bar"))
                         result ~= token;
                assert (result == "$bar&&bar*");
                ---
                
                This mechanism avoids internal heap activity             

        ***********************************************************************/

        Substitute tokens (const(T)[] content, const(T)[] sub = null)
        {
                return Substitute (sub, what, content, &forward);
        }
        
        /***********************************************************************

                Returns a foreach() iterator which exposes the indices of
                all matches within the given content:
                ---
                int count;

                auto match = search ("foo");
                foreach (index; match.indices("$foo&&foo*"))
                         ++count;
                assert (count is 2);
                ---

        ***********************************************************************/

        Indices indices (const(T)[] content)
        {
                return Indices (content, &forward);
        }
        
        /***********************************************************************

        ***********************************************************************/

        private size_t find (char* what, size_t wlen, char* content, size_t len, size_t ofs) 
        {
                auto s = content;
                content += ofs;
                auto e = s + len - wlen;
                while (content <= e)
                       if (*what is *content && matches(what, content, wlen))
                           return content - s;
                       else
                          content += offsets [content[wlen]];
                return len;
        }

        /***********************************************************************

        ***********************************************************************/

        private size_t rfind (char* what, size_t wlen, char* content, size_t len, size_t ofs) 
        {
                auto s = content;
                auto e = s + ofs - wlen;
                while (e >= content)
                       if (*what is *e && matches(what, e, wlen))
                           return e - s;
                       else
                          e -= offsets [*(e-1)];
                return len;
        }

        /***********************************************************************

        ***********************************************************************/

        private static bool matches (char* a, char* b, size_t length)
        {
                while (length > size_t.sizeof)
                       if (*cast(size_t*) a is *cast(size_t*) b)
                            a += size_t.sizeof, b += size_t.sizeof, length -= size_t.sizeof;
                       else
                          return false;

                while (length--)
                       if (*a++ != *b++) 
                           return false;
                return true;
        }

        /***********************************************************************

                Construct lookup table. We force the alphabet to be char[]
                always, and consider wider characters to be longer patterns
                instead

        ***********************************************************************/

        private void reset ()
        {
                auto what = cast(char[]) this.what;
                if (fore)   
                    for (int i=0; i < cast(int)what.length; ++i)
                         offsets[what[i]] = cast(int)what.length - i;
                else
                   for (int i=cast(int)what.length; i--;)
                        offsets[what[i]] = cast(int)(i+1);
        }

        /***********************************************************************

                Reverse lookup-table direction

        ***********************************************************************/

        private void flip ()
        {
                fore ^= true;
                reset();
        }

        /***********************************************************************

                Simple foreach() iterator

        ***********************************************************************/

        private struct Indices
        {
                const(T)[]    content;
                size_t delegate(const(T)[], size_t) call;

                int opApply (scope int delegate (ref size_t index) dg)
                {
                        int     ret;
                        size_t  mark;

                        while ((mark = call(content, mark)) != content.length)
                                if ((ret = dg(mark)) is 0)
                                     ++mark;
                                else
                                   break;
                        return ret;   
                }     
        } 

        /***********************************************************************

                Substitution foreach() iterator

        ***********************************************************************/

        private struct Substitute
        {
                private const(T)[] sub, 
                                   what,
                                   content;
                size_t             delegate(const(T)[], size_t) call;

                int opApply (scope int delegate (ref const(T)[] token) dg)
                {
                        int        ret;
                        size_t     pos,
                                   mark;
                        const(T)[] token;

                        while ((pos = call (content, mark)) < content.length)
                              {
                              token = content [mark .. pos];
                              if ((ret = dg(token)) != 0)
                                   return ret;
                              if (sub.ptr && (ret = dg(sub)) != 0)
                                  return ret;
                              mark = pos + what.length;
                              }

                        token = content [mark .. $];
                        if (mark <= content.length)
                            ret = dg (token);
                        return ret;
                }
        }
}




/******************************************************************************

******************************************************************************/

debug (Search)
{
        import tango.io.Stdout;
        import tango.time.StopWatch;

        auto x = import("Search.d");
        
        void main()
        {
                StopWatch elapsed;
        
                auto match = search("foo");
                auto index = match.reverse ("foo foo");
                assert (index is 4);
                index = match.reverse ("foo foo", index);
                assert (index is 0);
                index = match.reverse ("foo foo", 1);
                assert (index is 7);

                foreach (index; find("delegate").indices(x))
                         Stdout.formatln ("< {}", index);

                foreach (index; search("delegate").indices(x))
                         Stdout.formatln ("> {}", index);

                elapsed.start;
                for (auto i=5000; i--;)
                     Util.mismatch (x.ptr, x.ptr, x.length);
                Stdout.formatln ("mismatch {}", elapsed.stop);

                elapsed.start;
                for (auto i=5000; i--;)
                     Util.indexOf (x.ptr, '@', cast(uint) x.length);
                Stdout.formatln ("indexOf {}", elapsed.stop);

                elapsed.start;
                for (auto i=5000; i--;)
                     Util.locatePattern (x, "indexOf {}");
                Stdout.formatln ("pattern {}", elapsed.stop);

                elapsed.start;
                auto f = find ("indexOf {}");
                for (auto i=5000; i--;)
                     f.forward(x);
                Stdout.formatln ("find {}", elapsed.stop);

                elapsed.start;
                auto s = search ("indexOf {}");
                for (auto i=5000; i--;)
                     s.forward(x);
                Stdout.formatln ("search {}", elapsed.stop);
        }
}