123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782
/*******************************************************************************

        copyright:      Copyright (c) 2004 Kris Bell. All rights reserved

        license:        BSD style: $(LICENSE)
        
        version:        Initial release: Nov 2005
        
        author:         Kris

        A set of functions for converting between string and integer 
        values. 

        Applying the D "import alias" mechanism to this module is highly
        recommended, in order to limit namespace pollution:
        ---
        import Integer = tango.text.convert.Integer;

        auto i = Integer.parse ("32767");
        ---
        
*******************************************************************************/

module tango.text.convert.Integer;

private import tango.core.Exception;
private import tango.core.Octal;

/******************************************************************************

        Parse an integer value from the provided 'digits' string. 

        The string is inspected for a sign and an optional radix 
        prefix. A radix may be provided as an argument instead, 
        whereupon it must match the prefix (where present). When
        radix is set to zero, conversion will default to decimal.

        Throws: IllegalArgumentException where the input text is not parsable
        in its entirety.

        See_also: the low level functions parse() and convert()

******************************************************************************/
int toInt(T) (const(T[]) digits, uint radix=0)
{
        auto x = toLong (digits, radix);
        if (x > int.max)
            throw new IllegalArgumentException ("Integer.toInt :: integer overflow");
        return cast(int) x;
}

/******************************************************************************

        Parse an integer value from the provided 'digits' string.

        The string is inspected for a sign and an optional radix
        prefix. A radix may be provided as an argument instead,
        whereupon it must match the prefix (where present). When
        radix is set to zero, conversion will default to decimal.

        Throws: IllegalArgumentException where the input text is not parsable
        in its entirety.

        See_also: the low level functions parse() and convert()

******************************************************************************/
long toLong(T) (const(T[]) digits, uint radix=0)
{
        size_t len;

        auto x = parse (digits, radix, &len);
        if (len < digits.length)
            throw new IllegalArgumentException ("Integer.toLong :: invalid literal");
        return x;
}

/******************************************************************************

        Parse an unsignedinteger value from the provided 'digits' string.

        The string is inspected for an optional radix prefix. A
        radix may be provided as an argument instead, whereupon
        it must match the prefix (where present). When radix is
        set to zero, conversion will default to decimal.

        Throws: IllegalArgumentException where the input text is not parsable
        in its entirety.

        See_also: the low level functions parse() and convert()

******************************************************************************/
ulong toUlong(T) (const(T[]) digits, uint radix=0)
{
        bool sign = false;

        auto eaten = trim (digits, sign, radix);
        if (sign)
            throw new IllegalArgumentException ("Integer.toUlong :: invalid literal");

        size_t len = 0;
        auto value = convert (digits[eaten..$], radix, &len);
        if (len == 0 || eaten + len < digits.length)
            throw new IllegalArgumentException ("Integer.toUlong :: invalid literal");

        return value;
}

/******************************************************************************

        Wrapper to make life simpler. Returns a text version
        of the provided value.

        See format() for details

******************************************************************************/

char[] toString (long i, const(char[]) fmt = null)
{
        char[66] tmp = void;
        return format (tmp, i, fmt).dup;
}
               
/******************************************************************************

        Wrapper to make life simpler. Returns a text version
        of the provided value.

        See format() for details

******************************************************************************/

wchar[] toString16 (long i, const(wchar[]) fmt = null)
{
        wchar[66] tmp = void;
        return format (tmp, i, fmt).dup;
}
               
/******************************************************************************

        Wrapper to make life simpler. Returns a text version
        of the provided value.

        See format() for details

******************************************************************************/

dchar[] toString32 (long i, const(dchar[]) fmt = null)
{
        dchar[66] tmp = void;
        return format (tmp, i, fmt).dup;
}
               
/*******************************************************************************

        Supports format specifications via an array, where format follows
        the notation given below:
        ---
        type width prefix
        ---

        Type is one of [d, g, u, b, x, o] or uppercase equivalent, and
        dictates the conversion radix or other semantics.

        Width is optional and indicates a minimum width for zero-padding,
        while the optional prefix is one of ['#', ' ', '+'] and indicates
        what variety of prefix should be placed in the output. e.g.
        ---
        "d"     => integer
        "u"     => unsigned
        "o"     => octal
        "b"     => binary
        "x"     => hexadecimal
        "X"     => hexadecimal uppercase

        "d+"    => integer prefixed with "+"
        "b#"    => binary prefixed with "0b"
        "x#"    => hexadecimal prefixed with "0x"
        "X#"    => hexadecimal prefixed with "0X"

        "d8"    => decimal padded to 8 places as required
        "b8"    => binary padded to 8 places as required
        "b8#"   => binary padded to 8 places and prefixed with "0b"
        ---

        Note that the specified width is exclusive of the prefix, though
        the width padding will be shrunk as necessary in order to ensure
        a requested prefix can be inserted into the provided output.

*******************************************************************************/
T[] format(T) (T[] dst, long i, const(T[]) fmt = null)
{
        char    pre,
                type;
        int     width;

        decode (fmt, type, pre, width);
        return formatter (dst, i, type, pre, width);
} 

private void decode(T) (in T[] fmt, ref char type, out char pre, out int width)
{
        if (fmt.length is 0)
            type = 'd';
        else
           {
           type = cast(char)fmt[0];
           if (fmt.length > 1)
              {
              auto p = &fmt[1];
              for (int j=1; j < fmt.length; ++j, ++p)
                   if (*p >= '0' && *p <= '9')
                       width = width * 10 + (*p - '0');
                   else
                      pre = cast(char)*p;
              }
           }
} 
private struct _FormatterInfo(T)
{
    uint    radix;
    immutable(T)[]     prefix;
    immutable(T)[]     numbers;
}

T[] formatter(T) (T[] dst, long i, char type, char pre, int width)
{
        enum immutable(T)[] lower = "0123456789abcdef";
        enum immutable(T)[] upper = "0123456789ABCDEF";
        
        alias _FormatterInfo!(T) Info;

        enum   Info[] formats = 
                [
                {10, null, lower}, 
                {10, "-",  lower}, 
                {10, " ",  lower}, 
                {10, "+",  lower}, 
                { 2, "0b", lower}, 
                { 8, "0o", lower}, 
                {16, "0x", lower}, 
                {16, "0X", upper},
                ];

        ubyte index;
        int len = cast(int)dst.length;

        if (len)
           {
           switch (type)
                  {
                  case 'd':
                  case 'D':
                  case 'g':
                  case 'G':
                       if (i < 0)
                          {
                          index = 1;
                          i = -i;
                          }
                       else
                          if (pre is ' ')
                              index = 2;
                          else
                             if (pre is '+')
                                 index = 3;
                       goto case 'U';
                  case 'u':
                  case 'U':
                       pre = '#';
                       break;

                  case 'b':
                  case 'B':
                       index = 4;
                       break;

                  case 'o':
                  case 'O':
                       index = 5;
                       break;

                  case 'x':
                       index = 6;
                       break;

                  case 'X':
                       index = 7;
                       break;

                  default:
                        return cast(T[])"{unknown format '".dup~cast(T)type~cast(T[])"'}".dup;
                  }

           auto info = &formats[index];
           auto numbers = info.numbers;
           auto radix = info.radix;

           // convert number to text
           auto p = dst.ptr + len;
           if (uint.max >= cast(ulong) i)
              {
              auto v = cast (uint) i;
              do {
                 *--p = numbers [v % radix];
                 } while ((v /= radix) && --len);
              }
           else
              {
              auto v = cast (ulong) i;
              do {
                 *--p = numbers [cast(uint) (v % radix)];
                 } while ((v /= radix) && --len);
              }
        
           auto prefix = (pre is '#') ? info.prefix : null;
           if (len > prefix.length)
              {
              len -= prefix.length + 1;

              // prefix number with zeros? 
              if (width)
                 {
                 width = cast(int)dst.length - width - cast(int)prefix.length;
                 while (len > width && len > 0)
                       {
                       *--p = '0';
                       --len;
                       }
                 }
              // write optional prefix string ...
              dst [len .. len + prefix.length] = prefix;

              // return slice of provided output buffer
              return dst [len .. $];                               
              }
           }
        
        return cast(T[])"{output width too small}".dup;
} 


/******************************************************************************

        Parse an integer value from the provided 'digits' string. 

        The string is inspected for a sign and an optional radix 
        prefix. A radix may be provided as an argument instead, 
        whereupon it must match the prefix (where present). When
        radix is set to zero, conversion will default to decimal.

        A non-null 'ate' will return the number of characters used
        to construct the returned value.

        Throws: none. The 'ate' param should be checked for valid input.

******************************************************************************/
long parse(T) (T[] digits, uint radix=0, size_t* ate=null)
{
        bool sign;

        auto eaten = trim (digits, sign, radix);
        auto value = convert (digits[eaten..$], radix, ate);

        // check *ate > 0 to make sure we don't parse "-" as 0.
        if (ate && *ate > 0)
            *ate += eaten;

        return cast(long) (sign ? -value : value);
}

/******************************************************************************

        Convert the provided 'digits' into an integer value,
        without checking for a sign or radix. The radix defaults
        to decimal (10).

        Returns the value and updates 'ate' with the number of
        characters consumed.

        Throws: none. The 'ate' param should be checked for valid input.

******************************************************************************/
ulong convert(T) (const(T[]) digits, uint radix=10, size_t* ate=null)
{
        uint  eaten;
        ulong value;

        foreach (c; cast(T[])digits)
                {
                if (c >= '0' && c <= '9')
                   {}
                else
                   if (c >= 'a' && c <= 'z')
                       c -= 39;
                   else
                      if (c >= 'A' && c <= 'Z')
                          c -= 7;
                      else
                         break;

                if ((c -= '0') < radix)
                   {
                   value = value * radix + c;
                   ++eaten;
                   }
                else
                   break;
                }

        if (ate)
            *ate = eaten;

        return value;
}


/******************************************************************************

        Strip leading whitespace, extract an optional +/- sign,
        and an optional radix prefix. If the radix value matches
        an optional prefix, or the radix is zero, the prefix will
        be consumed and assigned. Where the radix is non zero and
        does not match an explicit prefix, the latter will remain 
        unconsumed. Otherwise, radix will default to 10.

        Returns the number of characters consumed.

******************************************************************************/
size_t trim(T) (const(T[]) digits, ref bool sign, ref uint radix)
{
        T       c;
        const (T)*      p = digits.ptr;
        auto     len = digits.length;

        if (len)
           {
           // strip off whitespace and sign characters
           for (c = *p; len; c = *++p, --len)
                if (c is ' ' || c is '\t')
                   {}
                else
                   if (c is '-')
                       sign = true;
                   else
                      if (c is '+')
                          sign = false;
                   else
                      break;

           // strip off a radix specifier also?
           auto r = radix;
           if (c is '0' && len > 1)
               switch (*++p)
                      {
                      case 'x':
                      case 'X':
                           ++p;
                           r = 16;
                           break;
 
                      case 'b':
                      case 'B':
                           ++p;
                           r = 2;
                           break;
 
                      case 'o':
                      case 'O':
                           ++p;
                           r = 8;
                           break;
 
                      default: 
                            --p;
                           break;
                      } 

           // default the radix to 10
           if (r is 0)
               radix = 10;
           else
              // explicit radix must match (optional) prefix
              if (radix != r)
              {
                  if (radix)
                      p -= 2;
                  else
                     radix = r;
              }
           }

        // return number of characters eaten
        return (p - digits.ptr);
}


/******************************************************************************

        quick & dirty text-to-unsigned int converter. Use only when you
        know what the content is, or use parse() or convert() instead.

        Return the parsed uint
        
******************************************************************************/

uint atoi(T) (T[] s, int radix = 10)
{
        uint value;

        foreach (c; s)
                 if (c >= '0' && c <= '9')
                     value = value * radix + (c - '0');
                 else
                    break;
        return value;
}


/******************************************************************************

        quick & dirty unsigned to text converter, where the provided output
        must be large enough to house the result (10 digits in the largest
        case). For mainstream use, consider utilizing format() instead.

        Returns a populated slice of the provided output
        
******************************************************************************/
T[] itoa(T) (T[] output, uint value, int radix = 10)
{
        T* p = output.ptr + output.length;

        do {
           *--p = cast(T)(value % radix + '0');
           } while (value /= radix);
        return output[cast(size_t) (p-output.ptr) .. $];
}


/******************************************************************************

        Consume a number from the input without converting it. Argument
        'fp' enables floating-point consumption. Supports hex input for
        numbers which are prefixed appropriately

        Since version 0.99.9

******************************************************************************/

T[] consume(T) (T[] src, bool fp=false)
{
        T       c;
        bool    sign;
        uint    radix;

        // remove leading space, and sign
        auto e = src.ptr + src.length;
        auto p = src.ptr + trim (src, sign, radix);
        auto b = p;

        // bail out if the string is empty
        if (src.length is 0 || p > &src[$-1])
            return null;

        // read leading digits
        for (c=*p; p < e && ((c >= '0' && c <= '9') || 
            (radix is 16 && ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'))));)
             c = *++p;

        if (fp)
           {
           // gobble up a point
           if (c is '.' && p < e)
               c = *++p;

           // read fractional digits
           while (c >= '0' && c <= '9' && p < e)
                  c = *++p;

           // did we consume anything?
           if (p > b)
              {
              // consume exponent?
              if ((c is 'e' || c is 'E') && p < e )
                 {
                 c = *++p;
                 if (c is '+' || c is '-')
                     c = *++p;
                 while (c >= '0' && c <= '9' && p < e)
                        c = *++p;
                 }
              }
           }
        return src [0 .. p-src.ptr];
}


/******************************************************************************

******************************************************************************/

debug (UnitTest)
{

        unittest
        {
        char[64] tmp;
        
        assert (toInt("1") is 1);
        assert (toLong("1") is 1);
        assert (toInt("1", 10) is 1);
        assert (toLong("1", 10) is 1);
        assert (toUlong("1", 10) is 1);
        assert (toUlong("18446744073709551615") is ulong.max);

        assert (atoi ("12345") is 12345);
        assert (itoa (tmp, 12345) == "12345");

        assert(parse( "0"w ) ==  0 );
        assert(parse( "1"w ) ==  1 );
        assert(parse( "-1"w ) ==  -1 );
        assert(parse( "+1"w ) ==  1 );

        // numerical limits
        assert(parse( "-2147483648" ) == int.min );
        assert(parse(  "2147483647" ) == int.max );
        assert(parse(  "4294967295" ) == uint.max );

        assert(parse( "-9223372036854775808" ) == long.min );
        assert(parse( "9223372036854775807" ) == long.max );
        assert(parse( "18446744073709551615" ) == ulong.max );

        // hex
        assert(parse( "a", 16) == 0x0A );
        assert(parse( "b", 16) == 0x0B );
        assert(parse( "c", 16) == 0x0C );
        assert(parse( "d", 16) == 0x0D );
        assert(parse( "e", 16) == 0x0E );
        assert(parse( "f", 16) == 0x0F );
        assert(parse( "A", 16) == 0x0A );
        assert(parse( "B", 16) == 0x0B );
        assert(parse( "C", 16) == 0x0C );
        assert(parse( "D", 16) == 0x0D );
        assert(parse( "E", 16) == 0x0E );
        assert(parse( "F", 16) == 0x0F );
        assert(parse( "FFFF", 16) == ushort.max );
        assert(parse( "ffffFFFF", 16) == uint.max );
        assert(parse( "ffffFFFFffffFFFF", 16u ) == ulong.max );
        // oct
        assert(parse( "55", 8) == octal!(55) );
        assert(parse( "100", 8) == octal!(100) );
        // bin
        assert(parse( "10000", 2) == 0x10 );
        // trim
        assert(parse( "    \t20") == 20 );
        assert(parse( "    \t-20") == -20 );
        assert(parse( "-    \t 20") == -20 );
        // recognise radix prefix
        assert(parse( "0xFFFF" ) == ushort.max );
        assert(parse( "0XffffFFFF" ) == uint.max );
        assert(parse( "0o55") == octal!(55) );
        assert(parse( "0O55" ) == octal!(55) );
        assert(parse( "0b10000") == 0x10 );
        assert(parse( "0B10000") == 0x10 );

        // prefix tests
        auto str = "0x";
        assert(parse( str[0..1] ) ==  0 );
        assert(parse("0x10", 10) == 0);
        assert(parse("0b10", 10) == 0);
        assert(parse("0o10", 10) == 0);
        assert(parse("0b10") == 0b10);
        assert(parse("0o10") == octal!(10));
        assert(parse("0b10", 2) == 0b10);
        assert(parse("0o10", 8) == octal!(10));

        // revised tests
        assert (format(tmp, 10, "d") == "10");
        assert (format(tmp, -10, "d") == "-10");

        assert (format(tmp, 10L, "u") == "10");
        assert (format(tmp, 10L, "U") == "10");
        assert (format(tmp, 10L, "g") == "10");
        assert (format(tmp, 10L, "G") == "10");
        assert (format(tmp, 10L, "o") == "12");
        assert (format(tmp, 10L, "O") == "12");
        assert (format(tmp, 10L, "b") == "1010");
        assert (format(tmp, 10L, "B") == "1010");
        assert (format(tmp, 10L, "x") == "a");
        assert (format(tmp, 10L, "X") == "A");

        assert (format(tmp, 10L, "d+") == "+10");
        assert (format(tmp, 10L, "d ") == " 10");
        assert (format(tmp, 10L, "d#") == "10");
        assert (format(tmp, 10L, "x#") == "0xa");
        assert (format(tmp, 10L, "X#") == "0XA");
        assert (format(tmp, 10L, "b#") == "0b1010");
        assert (format(tmp, 10L, "o#") == "0o12");

        assert (format(tmp, 10L, "d1") == "10");
        assert (format(tmp, 10L, "d8") == "00000010");
        assert (format(tmp, 10L, "x8") == "0000000a");
        assert (format(tmp, 10L, "X8") == "0000000A");
        assert (format(tmp, 10L, "b8") == "00001010");
        assert (format(tmp, 10L, "o8") == "00000012");

        assert (format(tmp, 10L, "d1#") == "10");
        assert (format(tmp, 10L, "d6#") == "000010");
        assert (format(tmp, 10L, "x6#") == "0x00000a");
        assert (format(tmp, 10L, "X6#") == "0X00000A");

        char[8] tmp1;
        assert (format(tmp1, 10L, "b12#") == "0b001010");
        assert (format(tmp1, 10L, "o12#") == "0o000012");
        }
}

/******************************************************************************

******************************************************************************/

debug (Integer)
{
        import tango.io.Stdout;

        void main()
        {
                char[8] tmp;

                Stdout.formatln ("d '{}'", format(tmp, 10));
                Stdout.formatln ("d '{}'", format(tmp, -10));

                Stdout.formatln ("u '{}'", format(tmp, 10L, "u"));
                Stdout.formatln ("U '{}'", format(tmp, 10L, "U"));
                Stdout.formatln ("g '{}'", format(tmp, 10L, "g"));
                Stdout.formatln ("G '{}'", format(tmp, 10L, "G"));
                Stdout.formatln ("o '{}'", format(tmp, 10L, "o"));
                Stdout.formatln ("O '{}'", format(tmp, 10L, "O"));
                Stdout.formatln ("b '{}'", format(tmp, 10L, "b"));
                Stdout.formatln ("B '{}'", format(tmp, 10L, "B"));
                Stdout.formatln ("x '{}'", format(tmp, 10L, "x"));
                Stdout.formatln ("X '{}'", format(tmp, 10L, "X"));

                Stdout.formatln ("d+ '{}'", format(tmp, 10L, "d+"));
                Stdout.formatln ("ds '{}'", format(tmp, 10L, "d "));
                Stdout.formatln ("d# '{}'", format(tmp, 10L, "d#"));
                Stdout.formatln ("x# '{}'", format(tmp, 10L, "x#"));
                Stdout.formatln ("X# '{}'", format(tmp, 10L, "X#"));
                Stdout.formatln ("b# '{}'", format(tmp, 10L, "b#"));
                Stdout.formatln ("o# '{}'", format(tmp, 10L, "o#"));

                Stdout.formatln ("d1 '{}'", format(tmp, 10L, "d1"));
                Stdout.formatln ("d8 '{}'", format(tmp, 10L, "d8"));
                Stdout.formatln ("x8 '{}'", format(tmp, 10L, "x8"));
                Stdout.formatln ("X8 '{}'", format(tmp, 10L, "X8"));
                Stdout.formatln ("b8 '{}'", format(tmp, 10L, "b8"));
                Stdout.formatln ("o8 '{}'", format(tmp, 10L, "o8"));

                Stdout.formatln ("d1# '{}'", format(tmp, 10L, "d1#"));
                Stdout.formatln ("d6# '{}'", format(tmp, 10L, "d6#"));
                Stdout.formatln ("x6# '{}'", format(tmp, 10L, "x6#"));
                Stdout.formatln ("X6# '{}'", format(tmp, 10L, "X6#"));

                Stdout.formatln ("b12# '{}'", format(tmp, 10L, "b12#"));
                Stdout.formatln ("o12# '{}'", format(tmp, 10L, "o12#")).newline;

                Stdout.formatln (consume("10"));
                Stdout.formatln (consume("0x1f"));
                Stdout.formatln (consume("0.123"));
                Stdout.formatln (consume("0.123", true));
                Stdout.formatln (consume("0.123e-10", true)).newline;

                Stdout.formatln (consume("10  s"));
                Stdout.formatln (consume("0x1f   s"));
                Stdout.formatln (consume("0.123  s"));
                Stdout.formatln (consume("0.123  s", true));
                Stdout.formatln (consume("0.123e-10  s", true)).newline;
        }
}