| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688 | 
/*******************************************************************************

        copyright:      Copyright (c) 2009 Kris Bell. All rights reserved

        license:        BSD style: $(LICENSE)

        version:        May 2009: Initial release

        since:          0.99.9

        author:         Kris

*******************************************************************************/

module tango.text.Search;

private import Util = tango.text.Util;

/******************************************************************************

        Factory for the lightweight matcher (FindFruct). Intended for
        short patterns and/or short to medium length content; it takes
        a brute-force approach with fast multi-byte comparisons.

        Params:
        what = the pattern to look for

        Returns:
        a FindFruct bound to the given pattern

******************************************************************************/

FindFruct!T find(T) (const(T)[] what)
{
        auto matcher = FindFruct!T (what);
        return matcher;
}

/******************************************************************************

        Factory for the welterweight matcher (SearchFruct). Intended for
        long patterns and/or extensive content. It is based on the QS
        algorithm, a Boyer-Moore variant, does not allocate memory for
        the alphabet, and generally becomes faster as the match-length
        grows.

        Params:
        what = the pattern to look for

        Returns:
        a SearchFruct primed with the given pattern

******************************************************************************/

SearchFruct!T search(T) (const(T)[] what)
{
        auto matcher = SearchFruct!T (what);
        return matcher;
}

/******************************************************************************

        Convenient bundle of lightweight find utilities, without the
        hassle of IFTI problems. Create one of these using the find()
        function:
        ---
        auto match = find ("foo");
        auto content = "wumpus foo bar";

        // search in the forward direction
        auto index = match.forward (content);
        assert (index is 7);

        // search again - returns length when no match found
        assert (match.forward(content, index+1) is content.length);
        ---

        Searching operates both forward and backward, with an optional
        start offset (can be more convenient than slicing the content).
        There are methods to replace matches within given content, and
        others which return foreach() iterators for traversing content.

SearchFruct is a more sophisticated variant, which operates more efficiently on longer matches and/or more extensive content. ******************************************************************************/ private struct FindFruct(T) { private const(T)[] what; /*********************************************************************** Search forward in the given content, starting at the optional index. Returns the index of a match, or content.length where no match was located. ***********************************************************************/ size_t forward (const(T)[] content, size_t ofs = 0) { return Util.index (content, what, ofs); } /*********************************************************************** Search backward in the given content, starting at the optional index. Returns the index of a match, or content.length where no match was located. ***********************************************************************/ size_t reverse (const(T)[] content, size_t ofs = size_t.max) { return Util.rindex (content, what, ofs); } /*********************************************************************** Return the match text ***********************************************************************/ @property const(T)[] match () { return what; } /*********************************************************************** Reset the text to match ***********************************************************************/ @property void match (const(T)[] what) { this.what = what; } /*********************************************************************** Returns true if there is a match within the given content ***********************************************************************/ bool within (const(T)[] content) { return forward(content) != content.length; } /*********************************************************************** Returns number of matches within the given content ***********************************************************************/ size_t count (const(T)[] 
content) { size_t mark, count; while ((mark = Util.index (content, what, mark)) != content.length) ++count, ++mark; return count; } /*********************************************************************** Replace all matches with the given character. Use method tokens() instead to avoid heap activity. Returns a copy of the content with replacements made ***********************************************************************/ T[] replace (const(T)[] content, T chr) { return replace (content, (&chr)[0..1]); } /*********************************************************************** Replace all matches with the given substitution. Use method tokens() instead to avoid heap activity. Returns a copy of the content with replacements made ***********************************************************************/ T[] replace (const(T)[] content, const(T)[] sub = null) { T[] output; foreach (s; tokens (content, sub)) output ~= s; return output; } /*********************************************************************** Returns a foreach() iterator which exposes text segments between all matches within the given content. Substitution text is also injected in place of each match, and null can be used to indicate removal instead: --- char[] result; auto match = find ("foo"); foreach (token; match.tokens ("$foo&&foo*", "bar")) result ~= token; assert (result == "$bar&&bar*"); --- This mechanism avoids internal heap activity. 
***********************************************************************/ Util.PatternFruct!(T) tokens (const(T)[] content, const(T)[] sub = null) { return Util.patterns (content, what, sub); } /*********************************************************************** Returns a foreach() iterator which exposes the indices of all matches within the given content: --- int count; auto f = find ("foo"); foreach (index; f.indices("$foo&&foo*")) ++count; assert (count is 2); --- ***********************************************************************/ Indices indices (const(T)[] content) { return Indices (what, content); } /*********************************************************************** Simple foreach() iterator ***********************************************************************/ private struct Indices { const(T)[] what, content; int opApply (scope int delegate (ref size_t index) dg) { int ret; size_t mark; while ((mark = Util.index(content, what, mark)) != content.length) if ((ret = dg(mark)) is 0) ++mark; else break; return ret; } } } /****************************************************************************** Convenient bundle of welterweight search utilities, without the hassle of IFTI problems. Create one of these using the search() function: --- auto match = search ("foo"); auto content = "wumpus foo bar" // search in the forward direction auto index = match.forward (content); assert (index is 7); // search again - returns length when no match found assert (match.forward(content, index+1) is content.length); --- Searching operates both forward and backward, with an optional start offset (can be more convenient than slicing the content). There are methods to replace matches within given content, and others which return foreach() iterators for traversing content. 
//****************************************************************************/
/******************************************************************************

        FindFruct is a simpler variant, which can operate efficiently on
        short matches and/or short content (employs brute-force strategy).

        SearchFruct scans byte-wise, whatever the width of T, using a
        256 entry shift table. Offsets are converted to bytes on the
        way in and back to element indices on the way out, and a match
        is only reported when it starts on an element boundary.

        An empty pattern never matches: every search then returns
        content.length.

        The iterators returned by tokens() and indices() hold a
        delegate bound to this instance, so the instance must stay
        alive, and in place, for as long as they are being iterated.

******************************************************************************/

private struct SearchFruct(T)
{
        private const(T)[]      what;           // pattern to locate
        private bool            fore;           // table is set up for forward scans
        private int[256]        offsets = void; // shift, in bytes, per byte value

        /***********************************************************************

                Construct the fruct

        ***********************************************************************/

        static SearchFruct opCall (const(T)[] what)
        {
                // every field is assigned by the match setter
                SearchFruct find = void;
                find.match = what;
                return find;
        }

        /***********************************************************************

                Return the match text

        ***********************************************************************/

        @property const(T)[] match ()
        {
                return what;
        }

        /***********************************************************************

                Reset the text to match. Rebuilds the shift table for
                forward scanning.

        ***********************************************************************/

        @property void match (const(T)[] what)
        {
                // shifts are counted in bytes, so a byte which is absent
                // from the pattern skips the byte-length of the pattern
                // plus one
                offsets[] = cast(int) (what.length * T.sizeof + 1);
                this.fore = true;
                this.what = what;
                reset();
        }

        /***********************************************************************

                Search forward in the given content, starting at the
                optional index (clamped to content.length).

                Returns the index of a match, or content.length where
                no match was located.

        ***********************************************************************/

        size_t forward (const(T)[] content, size_t ofs = 0)
        {
                if (! fore)
                      flip();

                if (ofs > content.length)
                    ofs = content.length;

                return find (cast(char*) what.ptr, what.length * T.sizeof,
                             cast(char*) content.ptr, content.length * T.sizeof,
                             ofs * T.sizeof) / T.sizeof;
        }

        /***********************************************************************

                Search backward in the given content. The optional
                index (clamped to content.length) is the exclusive
                upper bound of the region searched, so a match must
                end at or before it.

                Returns the index of a match, or content.length where
                no match was located.

        ***********************************************************************/

        size_t reverse (const(T)[] content, size_t ofs = size_t.max)
        {
                if (fore)
                    flip();

                if (ofs > content.length)
                    ofs = content.length;

                return rfind (cast(char*) what.ptr, what.length * T.sizeof,
                              cast(char*) content.ptr, content.length * T.sizeof,
                              ofs * T.sizeof) / T.sizeof;
        }

        /***********************************************************************

                Returns true if there is a match within the given
                content

        ***********************************************************************/

        bool within (const(T)[] content)
        {
                return forward(content) != content.length;
        }

        /***********************************************************************

                Returns number of matches within the given content.
                The scan resumes one element past the start of each
                match.

        ***********************************************************************/

        size_t count (const(T)[] content)
        {
                size_t mark, total;

                while ((mark = forward (content, mark)) != content.length)
                        ++total, ++mark;
                return total;
        }

        /***********************************************************************

                Replace all matches with the given character. Use method
                tokens() instead to avoid heap activity.

                Returns a copy of the content with replacements made

        ***********************************************************************/

        T[] replace (const(T)[] content, T chr)
        {
                return replace (content, (&chr)[0..1]);
        }

        /***********************************************************************

                Replace all matches with the given substitution. Use
                method tokens() instead to avoid heap activity.

                Returns a copy of the content with replacements made

        ***********************************************************************/

        T[] replace (const(T)[] content, const(T)[] sub = null)
        {
                T[] output;

                foreach (s; tokens (content, sub))
                         output ~= s;
                return output;
        }

        /***********************************************************************

                Returns a foreach() iterator which exposes text segments
                between all matches within the given content. Substitution
                text is also injected in place of each match, and null can
                be used to indicate removal instead:
                ---
                char[] result;

                auto match = search ("foo");
                foreach (token; match.tokens("$foo&&foo*", "bar"))
                         result ~= token;
                assert (result == "$bar&&bar*");
                ---

                This mechanism avoids internal heap activity

        ***********************************************************************/

        Substitute tokens (const(T)[] content, const(T)[] sub = null)
        {
                return Substitute (sub, what, content, &forward);
        }

        /***********************************************************************

                Returns a foreach() iterator which exposes the indices of
                all matches within the given content:
                ---
                int count;

                auto match = search ("foo");
                foreach (index; match.indices("$foo&&foo*"))
                         ++count;
                assert (count is 2);
                ---

        ***********************************************************************/

        Indices indices (const(T)[] content)
        {
                return Indices (content, &forward);
        }

        /***********************************************************************

                Forward scan over raw bytes.

                Params:
                what    = first byte of the pattern
                wlen    = pattern length in bytes
                content = first byte of the content
                len     = content length in bytes
                ofs     = byte offset at which to begin

                Returns the byte offset of the first element-aligned
                match at or after ofs, or len where there is none.

        ***********************************************************************/

        private size_t find (char* what, size_t wlen, char* content, size_t len, size_t ofs)
        {
                // an empty pattern, or one longer than the remaining
                // content, cannot match. Bailing out here also keeps
                // the pointer arithmetic below inside the content
                if (wlen is 0 || ofs > len || wlen > len - ofs)
                    return len;

                auto s    = content;
                auto last = s + (len - wlen);   // final candidate position
                content  += ofs;

                while (content <= last)
                      {
                      // the scan is byte-wise, so for wider T a hit must
                      // also begin on an element boundary
                      if (*what is *content
                          && cast(size_t) (content - s) % T.sizeof is 0
                          && matches (what, content, wlen))
                          return content - s;

                      // the shift is keyed on the byte following the
                      // window, which does not exist once the window
                      // touches the end of the content
                      if (content is last)
                          break;
                      content += offsets [content[wlen]];
                      }
                return len;
        }

        /***********************************************************************

                Backward scan over raw bytes.

                Params:
                what    = first byte of the pattern
                wlen    = pattern length in bytes
                content = first byte of the content
                len     = content length in bytes
                ofs     = byte offset bounding the search from above; a
                          match must end at or before it

                Returns the byte offset of the last element-aligned
                match ending at or before ofs, or len where there is
                none.

        ***********************************************************************/

        private size_t rfind (char* what, size_t wlen, char* content, size_t len, size_t ofs)
        {
                // same reasoning as in find(): nothing can match, and
                // the start pointer would otherwise precede the content
                if (wlen is 0 || ofs > len || wlen > ofs)
                    return len;

                auto s = content;
                auto e = s + (ofs - wlen);      // first candidate position

                for (;;)
                    {
                    if (*what is *e
                        && cast(size_t) (e - s) % T.sizeof is 0
                        && matches (what, e, wlen))
                        return e - s;

                    // the shift is keyed on the byte preceding the
                    // window, which does not exist at position zero
                    auto room = cast(size_t) (e - s);
                    if (room is 0)
                        break;

                    auto step = cast(size_t) offsets [*(e-1)];
                    if (step > room)
                        break;
                    e -= step;
                    }
                return len;
        }

        /***********************************************************************

                Compare length bytes at a and b, a machine word at a
                time for as long as more than one word remains, then
                byte by byte.

        ***********************************************************************/

        private static bool matches (char* a, char* b, size_t length)
        {
                while (length > size_t.sizeof)
                       if (*cast(size_t*) a is *cast(size_t*) b)
                            a += size_t.sizeof, b += size_t.sizeof, length -= size_t.sizeof;
                       else
                          return false;

                while (length--)
                       if (*a++ != *b++)
                           return false;
                return true;
        }

        /***********************************************************************

                Construct lookup table. We force the alphabet to be char[]
                always, and consider wider characters to be longer patterns
                instead.

                Only the entries for bytes present in the pattern are
                written; the default for all other bytes is installed
                by the match setter.

        ***********************************************************************/

        private void reset ()
        {
                auto bytes = cast(const(char)[]) this.what;

                if (fore)
                   {
                   // the right-most occurrence of a byte is written
                   // last, leaving the smallest shift in the table
                   foreach (i, c; bytes)
                            offsets[c] = cast(int) (bytes.length - i);
                   }
                else
                   {
                   // mirror image: the left-most occurrence is written
                   // last
                   foreach_reverse (i, c; bytes)
                            offsets[c] = cast(int) (i + 1);
                   }
        }

        /***********************************************************************

                Reverse lookup-table direction

        ***********************************************************************/

        private void flip ()
        {
                fore ^= true;
                reset();
        }

        /***********************************************************************

                Simple foreach() iterator over match positions. The
                call delegate performs the actual search.

        ***********************************************************************/

        private struct Indices
        {
                const(T)[]      content;
                size_t delegate(const(T)[], size_t) call;

                int opApply (scope int delegate (ref size_t index) dg)
                {
                        int     ret;
                        size_t  mark;

                        while ((mark = call(content, mark)) != content.length)
                                if ((ret = dg(mark)) is 0)
                                     ++mark;
                                else
                                   break;
                        return ret;
                }
        }

        /***********************************************************************

                Substitution foreach() iterator. Yields the text before
                each match, then the substitution when it is not null,
                and finally the text after the last match.

        ***********************************************************************/

        private struct Substitute
        {
                private const(T)[] sub,
                                   what,
                                   content;
                size_t delegate(const(T)[], size_t) call;

                int opApply (scope int delegate (ref const(T)[] token) dg)
                {
                        int        ret;
                        size_t     pos,
                                   mark;
                        const(T)[] token;

                        while ((pos = call (content, mark)) < content.length)
                              {
                              token = content [mark .. pos];
                              if ((ret = dg(token)) != 0)
                                   return ret;
                              if (sub.ptr && (ret = dg(sub)) != 0)
                                  return ret;

                              // resume immediately after the match
                              mark = pos + what.length;
                              }

                        if (mark <= content.length)
                           {
                           token = content [mark .. $];
                           ret = dg (token);
                           }
                        return ret;
                }
        }
}

/******************************************************************************

        Self-check and rough timing comparison, built only with
        -debug=Search. The content searched is this source file, pulled
        in through a string import.

******************************************************************************/

debug (Search)
{
        import tango.io.Stdout;
        import tango.time.StopWatch;

        auto x = import("Search.d");

        void main()
        {
                StopWatch elapsed;

                auto match = search("foo");
                auto index = match.reverse ("foo foo");
                assert (index is 4);
                index = match.reverse ("foo foo", index);
                assert (index is 0);
                index = match.reverse ("foo foo", 1);
                assert (index is 7);

                // the loop variable must not reuse the name 'index', and
                // the matchers are named so that they outlive their
                // iterators
                auto finder = find ("delegate");
                foreach (at; finder.indices(x))
                         Stdout.formatln ("< {}", at);

                auto searcher = search ("delegate");
                foreach (at; searcher.indices(x))
                         Stdout.formatln ("> {}", at);

                elapsed.start;
                for (auto i=5000; i--;)
                     Util.mismatch (x.ptr, x.ptr, x.length);
                Stdout.formatln ("mismatch {}", elapsed.stop);

                elapsed.start;
                for (auto i=5000; i--;)
                     Util.indexOf (x.ptr, '@', cast(uint) x.length);
                Stdout.formatln ("indexOf {}", elapsed.stop);

                elapsed.start;
                for (auto i=5000; i--;)
                     Util.locatePattern (x, "indexOf {}");
                Stdout.formatln ("pattern {}", elapsed.stop);

                elapsed.start;
                auto f = find ("indexOf {}");
                for (auto i=5000; i--;)
                     f.forward(x);
                Stdout.formatln ("find {}", elapsed.stop);

                elapsed.start;
                auto s = search ("indexOf {}");
                for (auto i=5000; i--;)
                     s.forward(x);
                Stdout.formatln ("search {}", elapsed.stop);
        }
}