/*******************************************************************************
copyright: Copyright (c) 2009 Kris Bell. All rights reserved
license: BSD style: $(LICENSE)
version: May 2009: Initial release
since: 0.99.9
author: Kris
*******************************************************************************/
module tango.text.Search;
private import Util = tango.text.Util;
/******************************************************************************

        Factory for the lightweight, brute-force pattern matcher. Suited
        to short patterns and/or short to medium length content; relies
        on fast multi-byte comparisons.

        Params:
        what = the text to look for

        Returns: a FindFruct bound to the given pattern

******************************************************************************/

FindFruct!(T) find(T) (const(T)[] what)
{
        alias FindFruct!(T) Finder;

        // the pattern is the only state a FindFruct carries
        return Finder (what);
}
/******************************************************************************

        Factory for the welterweight pattern matcher. Suited to long
        patterns and/or extensive content. Based on the QS algorithm,
        a Boyer-Moore variant; no memory is allocated for the alphabet.
        Generally becomes faster as the match-length grows.

        Params:
        what = the text to look for

        Returns: a SearchFruct primed with the given pattern

******************************************************************************/

SearchFruct!(T) search(T) (const(T)[] what)
{
        // spelled out: construction goes through the static opCall,
        // which also builds the shift table for the pattern
        return SearchFruct!(T).opCall (what);
}
/******************************************************************************

        Bundle of lightweight find utilities which sidesteps IFTI
        problems. Instances are created via the find() function:
        ---
        auto match = find ("foo");
        auto content = "wumpus foo bar";

        // search in the forward direction
        auto index = match.forward (content);
        assert (index is 7);

        // search again - returns length when no match found
        assert (match.forward(content, index+1) is content.length);
        ---

        Searches run forward or backward, optionally from a start
        offset (often handier than slicing the content). Further
        methods replace matches within content, or hand back foreach()
        iterators for walking over it.

        SearchFruct is the more sophisticated sibling, more efficient
        on longer matches and/or more extensive content.

******************************************************************************/

private struct FindFruct(T)
{
        // the pattern being looked for; must remain the first field, as
        // the struct is built positionally
        private const(T)[] what;

        /***********************************************************************

                Forward search of content, beginning at the optional
                offset.

                Returns: the index of a match, or content.length where
                no match was located

        ***********************************************************************/

        size_t forward (const(T)[] content, size_t ofs = 0)
        {
                auto hit = Util.index (content, what, ofs);
                return hit;
        }

        /***********************************************************************

                Backward search of content, beginning at the optional
                offset.

                Returns: the index of a match, or content.length where
                no match was located

        ***********************************************************************/

        size_t reverse (const(T)[] content, size_t ofs = size_t.max)
        {
                auto hit = Util.rindex (content, what, ofs);
                return hit;
        }

        /***********************************************************************

                The text currently being matched

        ***********************************************************************/

        @property
        const(T)[] match ()
        {
                return this.what;
        }

        /***********************************************************************

                Replace the text being matched

        ***********************************************************************/

        @property
        void match (const(T)[] what)
        {
                this.what = what;
        }

        /***********************************************************************

                Returns: true when content holds at least one match

        ***********************************************************************/

        bool within (const(T)[] content)
        {
                // forward() yields content.length to signal "not found"
                return content.length != forward (content);
        }

        /***********************************************************************

                Returns: how many matches content holds. Each search
                resumes one element past the previous hit.

        ***********************************************************************/

        size_t count (const(T)[] content)
        {
                size_t total,
                       from;

                for (;;)
                    {
                    auto hit = Util.index (content, what, from);
                    if (hit == content.length)
                        return total;

                    ++total;
                    from = hit + 1;
                    }
        }

        /***********************************************************************

                Replace every match with a single character. Prefer
                tokens() where heap activity should be avoided.

                Returns: a copy of content with the replacements made

        ***********************************************************************/

        T[] replace (const(T)[] content, T chr)
        {
                // view the by-value parameter as a one-element slice
                const(T)[] single = (&chr)[0 .. 1];
                return replace (content, single);
        }

        /***********************************************************************

                Replace every match with the given substitution. Prefer
                tokens() where heap activity should be avoided.

                Returns: a copy of content with the replacements made

        ***********************************************************************/

        T[] replace (const(T)[] content, const(T)[] sub = null)
        {
                T[] result;

                foreach (piece; tokens (content, sub))
                        {
                        result ~= piece;
                        }
                return result;
        }

        /***********************************************************************

                Returns a foreach() iterator over the text segments
                lying between matches in content. The substitution text
                is injected in place of each match; pass null to remove
                matches instead:
                ---
                char[] result;

                auto match = find ("foo");
                foreach (token; match.tokens ("$foo&&foo*", "bar"))
                         result ~= token;
                assert (result == "$bar&&bar*");
                ---

                This mechanism avoids internal heap activity.

        ***********************************************************************/

        Util.PatternFruct!(T) tokens (const(T)[] content, const(T)[] sub = null)
        {
                auto iterator = Util.patterns (content, what, sub);
                return iterator;
        }

        /***********************************************************************

                Returns a foreach() iterator over the index of every
                match in content:
                ---
                int count;

                auto f = find ("foo");
                foreach (index; f.indices("$foo&&foo*"))
                         ++count;
                assert (count is 2);
                ---

        ***********************************************************************/

        Indices indices (const(T)[] content)
        {
                Indices walker;

                walker.what = what;
                walker.content = content;
                return walker;
        }

        /***********************************************************************

                Simple foreach() iterator yielding match positions

        ***********************************************************************/

        private struct Indices
        {
                const(T)[] what,
                           content;

                int opApply (scope int delegate (ref size_t index) dg)
                {
                        size_t cursor;

                        for (;;)
                            {
                            cursor = Util.index (content, what, cursor);
                            if (cursor == content.length)
                                return 0;

                            // the position is passed by reference, so the
                            // loop body sees the very variable we resume from
                            auto result = dg (cursor);
                            if (result != 0)
                                return result;

                            ++cursor;
                            }
                }
        }
}
/******************************************************************************

        Convenient bundle of welterweight search utilities, without the
        hassle of IFTI problems. Create one of these using the search()
        function:
        ---
        auto match = search ("foo");
        auto content = "wumpus foo bar";

        // search in the forward direction
        auto index = match.forward (content);
        assert (index is 7);

        // search again - returns length when no match found
        assert (match.forward(content, index+1) is content.length);
        ---

        Searching operates both forward and backward, with an optional
        start offset (can be more convenient than slicing the content).
        There are methods to replace matches within given content, and
        others which return foreach() iterators for traversing content.

        The shift table is direction specific: forward() and reverse()
        rebuild it whenever the direction changes, so searching mutates
        the instance. An empty pattern never matches.

        FindFruct is a simpler variant, which can operate efficiently on
        short matches and/or short content (employs brute-force strategy)

******************************************************************************/

private struct SearchFruct(T)
{
        private const(T)[]      what;           // pattern to locate
        private bool            fore;           // table built for forward search?
        private int[256]        offsets = void; // per-byte shift distances

        /***********************************************************************

                Construct the fruct

                Params:
                what = the text to look for

        ***********************************************************************/

        static SearchFruct opCall (const(T)[] what)
        {
                // every field is assigned by the match setter below, so
                // default initialization is skipped
                SearchFruct fruct = void;
                fruct.match = what;
                return fruct;
        }

        /***********************************************************************

                Return the match text

        ***********************************************************************/

        @property
        const(T)[] match ()
        {
                return what;
        }

        /***********************************************************************

                Reset the text to match. Rebuilds the shift table for
                forward searching.

        ***********************************************************************/

        @property
        void match (const(T)[] what)
        {
                // bytes absent from the pattern shift one past the whole
                // pattern. The table is byte oriented (see reset), so the
                // distance is measured in bytes rather than in elements
                offsets[] = cast(int) (what.length * T.sizeof + 1);
                this.fore = true;
                this.what = what;
                reset();
        }

        /***********************************************************************

                Search forward in the given content, starting at the
                optional index.

                Returns the index of a match, or content.length where
                no match was located.

        ***********************************************************************/

        size_t forward (const(T)[] content, size_t ofs = 0)
        {
                if (! fore)
                      flip();

                if (ofs > content.length)
                    ofs = content.length;

                // the search itself runs over raw bytes
                return find (cast(char*) what.ptr, what.length * T.sizeof,
                             cast(char*) content.ptr, content.length * T.sizeof,
                             ofs * T.sizeof) / T.sizeof;
        }

        /***********************************************************************

                Search backward in the given content, starting at the
                optional index. A match must end at or before that
                index.

                Returns the index of a match, or content.length where
                no match was located.

        ***********************************************************************/

        size_t reverse (const(T)[] content, size_t ofs = size_t.max)
        {
                if (fore)
                    flip();

                if (ofs > content.length)
                    ofs = content.length;

                return rfind (cast(char*) what.ptr, what.length * T.sizeof,
                              cast(char*) content.ptr, content.length * T.sizeof,
                              ofs * T.sizeof) / T.sizeof;
        }

        /***********************************************************************

                Returns true if there is a match within the given content

        ***********************************************************************/

        bool within (const(T)[] content)
        {
                return forward(content) != content.length;
        }

        /***********************************************************************

                Returns number of matches within the given content. The
                search resumes one element past each hit.

        ***********************************************************************/

        size_t count (const(T)[] content)
        {
                size_t mark,
                       total;

                while ((mark = forward (content, mark)) != content.length)
                        ++total, ++mark;
                return total;
        }

        /***********************************************************************

                Replace all matches with the given character. Use method
                tokens() instead to avoid heap activity.

                Returns a copy of the content with replacements made

        ***********************************************************************/

        T[] replace (const(T)[] content, T chr)
        {
                return replace (content, (&chr)[0..1]);
        }

        /***********************************************************************

                Replace all matches with the given substitution. Use
                method tokens() instead to avoid heap activity.

                Returns a copy of the content with replacements made

        ***********************************************************************/

        T[] replace (const(T)[] content, const(T)[] sub = null)
        {
                T[] output;

                foreach (s; tokens (content, sub))
                         output ~= s;
                return output;
        }

        /***********************************************************************

                Returns a foreach() iterator which exposes text segments
                between all matches within the given content. Substitution
                text is also injected in place of each match, and null can
                be used to indicate removal instead:
                ---
                char[] result;

                auto match = search ("foo");
                foreach (token; match.tokens("$foo&&foo*", "bar"))
                         result ~= token;
                assert (result == "$bar&&bar*");
                ---

                This mechanism avoids internal heap activity. The
                iterator calls back into this instance, which must
                therefore remain alive while iterating.

        ***********************************************************************/

        Substitute tokens (const(T)[] content, const(T)[] sub = null)
        {
                return Substitute (sub, what, content, &forward);
        }

        /***********************************************************************

                Returns a foreach() iterator which exposes the indices of
                all matches within the given content:
                ---
                int count;

                auto match = search ("foo");
                foreach (index; match.indices("$foo&&foo*"))
                         ++count;
                assert (count is 2);
                ---

                The iterator calls back into this instance, which must
                therefore remain alive while iterating.

        ***********************************************************************/

        Indices indices (const(T)[] content)
        {
                return Indices (content, &forward);
        }

        /***********************************************************************

                True where a byte offset falls on an element boundary.
                Byte-level hits elsewhere straddle two elements of a
                multi-byte T and are not genuine matches.

        ***********************************************************************/

        private static bool aligned (size_t byteOffset)
        {
                return (byteOffset % T.sizeof) is 0;
        }

        /***********************************************************************

                Forward byte-level search.

                Params:
                what    = first byte of the pattern
                wlen    = pattern length in bytes
                content = first byte of the content
                len     = content length in bytes
                ofs     = byte offset at which to begin

                Returns: byte offset of the first match at or after
                ofs, or len where there is none

        ***********************************************************************/

        private size_t find (char* what, size_t wlen, char* content, size_t len, size_t ofs)
        {
                // an empty pattern, or one which cannot fit in what remains,
                // has no match. Checking first also keeps the pointer math
                // below inside the content, including for null content
                if (wlen is 0 || wlen > len || ofs > len - wlen)
                    return len;

                auto s = content;
                auto e = s + (len - wlen);      // last viable window start

                content += ofs;
                while (content <= e)
                      {
                      if (*what is *content
                          && aligned (cast(size_t) (content - s))
                          && matches (what, content, wlen))
                          return content - s;

                      // the shift is keyed on the byte just beyond the
                      // window; the final window has no such byte
                      if (content is e)
                          break;

                      content += offsets [content[wlen]];
                      }
                return len;
        }

        /***********************************************************************

                Backward byte-level search.

                Params:
                what    = first byte of the pattern
                wlen    = pattern length in bytes
                content = first byte of the content
                len     = content length in bytes
                ofs     = byte offset at or before which a match must end

                Returns: byte offset of the last match ending at or
                before ofs, or len where there is none

        ***********************************************************************/

        private size_t rfind (char* what, size_t wlen, char* content, size_t len, size_t ofs)
        {
                if (ofs > len)
                    ofs = len;

                // an empty pattern, or one longer than the searchable
                // region, has no match
                if (wlen is 0 || ofs < wlen)
                    return len;

                auto s = content;
                auto e = s + (ofs - wlen);      // rightmost viable window start

                for (;;)
                    {
                    if (*what is *e
                        && aligned (cast(size_t) (e - s))
                        && matches (what, e, wlen))
                        return e - s;

                    // the shift is keyed on the byte just before the
                    // window; the first window has no such byte
                    if (e is s)
                        break;

                    auto step = offsets [*(e-1)];

                    // stop rather than step in front of the content
                    if (step > e - s)
                        break;
                    e -= step;
                    }
                return len;
        }

        /***********************************************************************

                Compare length bytes at a and b, a machine word at a
                time while more than a word remains, then byte by byte.

        ***********************************************************************/

        private static bool matches (char* a, char* b, size_t length)
        {
                while (length > size_t.sizeof)
                       if (*cast(size_t*) a is *cast(size_t*) b)
                            a += size_t.sizeof, b += size_t.sizeof, length -= size_t.sizeof;
                       else
                          return false;

                while (length--)
                       if (*a++ != *b++)
                           return false;
                return true;
        }

        /***********************************************************************

                Construct lookup table. We force the alphabet to be char[]
                always, and consider wider characters to be longer patterns
                instead

        ***********************************************************************/

        private void reset ()
        {
                auto bytes = cast(const(char)[]) this.what;
                auto total = cast(int) bytes.length;

                if (fore)
                    // ascending, so the rightmost occurrence of a byte wins
                    for (int i=0; i < total; ++i)
                         offsets[bytes[i]] = total - i;
                else
                   // descending, so the leftmost occurrence of a byte wins
                   for (int i=total; i--;)
                        offsets[bytes[i]] = i + 1;
        }

        /***********************************************************************

                Reverse lookup-table direction

        ***********************************************************************/

        private void flip ()
        {
                fore ^= true;
                reset();
        }

        /***********************************************************************

                Simple foreach() iterator

        ***********************************************************************/

        private struct Indices
        {
                const(T)[]                              content;
                size_t delegate(const(T)[], size_t)     call;

                int opApply (scope int delegate (ref size_t index) dg)
                {
                        int     ret;
                        size_t  mark;

                        while ((mark = call(content, mark)) != content.length)
                                if ((ret = dg(mark)) is 0)
                                     ++mark;
                                else
                                   break;
                        return ret;
                }
        }

        /***********************************************************************

                Substitution foreach() iterator

        ***********************************************************************/

        private struct Substitute
        {
                private const(T)[]                      sub,
                                                        what,
                                                        content;
                size_t delegate(const(T)[], size_t)     call;

                int opApply (scope int delegate (ref const(T)[] token) dg)
                {
                        int             ret;
                        size_t          pos,
                                        mark;
                        const(T)[]      token;

                        while ((pos = call (content, mark)) < content.length)
                              {
                              // text lying between the prior match and this one
                              token = content [mark .. pos];
                              if ((ret = dg(token)) != 0)
                                   return ret;

                              // a null substitution removes the match
                              if (sub.ptr && (ret = dg(sub)) != 0)
                                  return ret;

                              // resume just beyond the matched text
                              mark = pos + what.length;
                              }

                        // whatever trails the final match
                        if (mark <= content.length)
                           {
                           token = content [mark .. $];
                           ret = dg (token);
                           }
                        return ret;
                }
        }
}
/******************************************************************************

        Ad-hoc self test and micro benchmark, compiled only when the
        Search debug identifier is enabled. The module's own source
        text is pulled in via a string import and used as the content
        to search.

******************************************************************************/

debug (Search)
{
        import tango.io.Stdout;
        import tango.time.StopWatch;

        auto x = import("Search.d");

        void main()
        {
                StopWatch elapsed;

                // reverse searching, with and without an offset
                auto match = search("foo");
                auto index = match.reverse ("foo foo");
                assert (index is 4);
                index = match.reverse ("foo foo", index);
                assert (index is 0);
                index = match.reverse ("foo foo", 1);
                assert (index is 7);

                // the loop variables carry their own name: D rejects a
                // declaration which shadows the local 'index' above
                foreach (at; find("delegate").indices(x))
                         Stdout.formatln ("< {}", at);

                foreach (at; search("delegate").indices(x))
                         Stdout.formatln ("> {}", at);

                // timings: 5000 rounds of each primitive over the same text
                elapsed.start;
                for (auto i=5000; i--;)
                     Util.mismatch (x.ptr, x.ptr, x.length);
                Stdout.formatln ("mismatch {}", elapsed.stop);

                elapsed.start;
                for (auto i=5000; i--;)
                     Util.indexOf (x.ptr, '@', cast(uint) x.length);
                Stdout.formatln ("indexOf {}", elapsed.stop);

                elapsed.start;
                for (auto i=5000; i--;)
                     Util.locatePattern (x, "indexOf {}");
                Stdout.formatln ("pattern {}", elapsed.stop);

                elapsed.start;
                auto f = find ("indexOf {}");
                for (auto i=5000; i--;)
                     f.forward(x);
                Stdout.formatln ("find {}", elapsed.stop);

                elapsed.start;
                auto s = search ("indexOf {}");
                for (auto i=5000; i--;)
                     s.forward(x);
                Stdout.formatln ("search {}", elapsed.stop);
        }
}