tango.io.stream.Iterator

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292
/*******************************************************************************

        copyright:      Copyright (c) 2004 Kris Bell. All rights reserved

        license:        BSD style: $(LICENSE)

        version:        Initial release: December 2005

        author:         Kris

*******************************************************************************/

module tango.io.stream.Iterator;

private import tango.io.stream.Buffered;

protected import tango.io.device.Conduit : InputFilter, InputBuffer, InputStream;

/*******************************************************************************

        The base class for a set of stream iterators. These operate
        upon a buffered input stream, and are designed to deal with
        partial content. That is, stream iterators go to work the
        moment any data becomes available in the buffer. Contrast
        this behaviour with the tango.text.Util iterators, which
        operate upon the extent of an array.

        There are two types of iterators supported; exclusive and
        inclusive. The former are the more common kind, where a token
        is delimited by elements that are considered foreign. Examples
        include space, comma, and end-of-line delineation. Inclusive
        tokens are just the opposite: they look for patterns in the
        text that should be part of the token itself - everything else
        is considered foreign. Currently tango.io.stream includes the
        exclusive variety only.

        Each pattern is exposed to the client as a slice of the original
        content, where the slice is transient. If you need to retain the
        exposed content, then you should .dup it appropriately.

        The content provided to these iterators is intended to be fully
        read-only. All current tokenizers abide by this rule, but it is
        possible a user could mutate the content through a token slice.
        To enforce the desired read-only aspect, the code would have to
        introduce redundant copying or the compiler would have to support
        read-only arrays (now in D2).

        See Delimiters, Lines, Patterns, Quotes.

*******************************************************************************/

class Iterator(T) : InputFilter
{
        // buffered view of the stream being scanned (assigned by set())
        private InputBuffer     source;

        // current token, and the delimiter recorded by the four-argument set()
        protected const(T)[]    slice,
                                delim;

        /***********************************************************************

                Pattern scanner supplied by each concrete subclass. It
                is handed to the input buffer by consume(), and receives
                the content to examine as an untyped array.

        ***********************************************************************/

        abstract protected size_t scan (const(void)[] data);

        /***********************************************************************

                Construct an iterator, optionally attached to a stream.
                When no stream is given, set() has to be called before
                any token is requested.

        ***********************************************************************/

        this (InputStream stream = null)
        {
                super (stream);

                if (stream !is null)
                    set (stream);
        }

        /***********************************************************************

                Attach the iterator to the given stream. The stream is
                wrapped via BufferedInput.create(), and the resulting
                buffer becomes both the scanning source and the source
                of the underlying filter.

                Returns this instance, to support call chaining.

        ***********************************************************************/

        Iterator set (InputStream stream)
        {
                assert (stream);

                super.source = source = BufferedInput.create (stream);
                return this;
        }

        /***********************************************************************

                Expose the most recently located token. The result is a
                slice of the buffered content, and not a copy.

        ***********************************************************************/

        final const(T)[] get ()
        {
                return slice;
        }

        /**********************************************************************

                foreach support: visit each token in turn. The delegate
                is always invoked at least once, and iteration halts as
                soon as either the delegate returns non-zero or there is
                nothing further to consume.

        **********************************************************************/

        int opApply (scope int delegate(ref const(T)[]) dg)
        {
                for (;;)
                    {
                    auto pending = consume();
                    auto status  = dg (slice);

                    if (status || !pending)
                        return status;
                    }
        }

        /**********************************************************************

                foreach support: visit each token along with a running
                token index, which begins at zero and is incremented
                after each delegate call.

        **********************************************************************/

        int opApply (scope int delegate(ref int, ref const(T)[]) dg)
        {
                int index;

                for (;;)
                    {
                    auto pending = consume();
                    auto status  = dg (index, slice);
                    ++index;

                    if (status || !pending)
                        return status;
                    }
        }

        /**********************************************************************

                foreach support: visit each token along with a running
                token index (beginning at zero) and the delimiter noted
                for that token. The delimiter is cleared ahead of every
                consume, so it is null whenever the scanner recorded none.

        **********************************************************************/

        int opApply (scope int delegate(ref int, ref const(T)[], ref const(T)[]) dg)
        {
                int index;

                for (;;)
                    {
                    delim = null;
                    auto pending = consume();
                    auto status  = dg (index, slice, delim);
                    ++index;

                    if (status || !pending)
                        return status;
                    }
        }

        /***********************************************************************

                Advance to the next token and return it. Null is returned
                once nothing more could be consumed and the resulting
                slice is empty, which denotes the end of the stream. To
                walk a conduit line by line using next():
                ---
                auto lines = new Lines!(char) (new File("myfile"));
                while (lines.next)
                       Cout (lines.get).newline;
                ---

                A single line can be pulled from a conduit like so:
                ---
                auto line = (new Lines!(char) (new File("myfile"))).next;
                ---

                next() hands out tokens one at a time, under control of
                the caller, whereas foreach() sweeps through all tokens
                in a single pass:
                ---
                foreach (line; new Lines!(char) (new File("myfile")))
                         Cout(line).newline;
                ---

        ***********************************************************************/

        @property final const(T)[] next ()
        {
                // an exhausted source may still have left a trailing token
                if (!consume() && slice.length == 0)
                    return null;

                return slice;
        }

        /***********************************************************************

                Point the current token at content[start .. end], and
                return the end index.

        ***********************************************************************/

        protected final size_t set (const(T)* content, size_t start, size_t end)
        {
                auto token = content [start .. end];

                slice = token;
                return end;
        }

        /***********************************************************************

                Point the current token at content[start .. end], and
                the delimiter at the segment between end and next
                (inclusive of next). Returns the end index.

        ***********************************************************************/

        protected final size_t set (const(T)* content, size_t start, size_t end, size_t next)
        {
                auto token     = content [start .. end];
                auto separator = content [end .. next+1];

                slice = token;
                delim = separator;
                return end;
        }

        /***********************************************************************

                Result for a scanner to return when no pattern matched:
                this is always Eof. Presumably that prompts the buffer
                to load more content and rescan - confirm against the
                InputBuffer implementation.

        ***********************************************************************/

        protected final size_t notFound ()
        {
                return Eof;
        }

        /***********************************************************************

                Result for a scanner to return upon a match. The index
                of the last element of the matched pattern is turned
                into a void[] (byte) offset just beyond that element.

        ***********************************************************************/

        protected final size_t found (size_t i)
        {
                auto elements = i + 1;

                return elements * T.sizeof;
        }

        /***********************************************************************

                Test whether the given set of elements contains match.
                Elements are compared using 'is'.

        ***********************************************************************/

        protected final bool has (const(T)[] set, T match)
        {
                for (size_t i = 0; i < set.length; ++i)
                     if (set[i] is match)
                         return true;

                return false;
        }

        /***********************************************************************

                Fetch the next token into 'slice'. True is returned when
                the buffer located a token via scan(), meaning there may
                be more to come. Otherwise the remaining content offered
                by the buffer becomes the (trailing) token, and false is
                returned.

        ***********************************************************************/

        private bool consume ()
        {
                if (! source.next (&scan))
                   {
                   // no further match: adopt whatever remains as the token,
                   // and report its entire byte length back to the buffer
                   source.reader ((const(void)[] arr)
                                 {
                                 auto count = arr.length / T.sizeof;
                                 slice = (cast(const(T)*) arr.ptr) [0 .. count];
                                 return cast(size_t) arr.length;
                                 });
                   return false;
                   }

                return true;
        }
}