123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361
/*******************************************************************************

        Copyright: Copyright (C) 2008 Kris Bell.  All rights reserved.

        License:   BSD style: $(LICENSE)

        version:   Aug 2008: Initial release

        Authors:   Kris

*******************************************************************************/

module tango.text.xml.DocEntity;

private import Util = tango.text.Util;

/******************************************************************************

        Convert XML entity patterns to normal characters

        <pre>
        &amp;  => &
        &quot; => "
        &apos; => '
        &lt;   => <
        &gt;   => >
        </pre>

        Only the five patterns listed above are translated. Any other
        '&' (including one at the very end of the input) is copied to
        the output unchanged.

        Params:
        src = the text to decode
        dst = optional workspace. It is resized to src.length when it
              is shorter than that

        Returns:
        A slice of dst holding the decoded text. The content is always
        copied into dst, including when src contains no '&' at all.

******************************************************************************/

T[] fromEntity (T) (const(T)[] src, T[] dst = null)
{
        ptrdiff_t delta;
        auto s = src.ptr;
        auto len = src.length;

        // take a peek first to see if there's anything
        if ((delta = Util.indexOf (s, '&', len)) < len)
           {
           // each translation emits one char for several consumed, so
           // src.length is always enough room for the output
           if (dst.length < src.length)
               dst.length = src.length;
           auto d = dst.ptr;

           // copy segments over, a chunk at a time
           do {
              d [0 .. delta] = s [0 .. delta];
              len -= delta;
              s += delta;
              d += delta;

              // number of source chars consumed by a recognised entity
              auto token = 0;

              // s[0] is the '&' here. Only inspect s[1] when it exists:
              // pointer indexing is unchecked, so a trailing '&' would
              // otherwise read past the end of src
              if (len > 1)
                 {
                 switch (s[1])
                        {
                        case 'a':
                             if (len > 4 && s[1..5] == "amp;")
                                {
                                *d++ = '&';
                                token = 5;
                                }
                             else
                                if (len > 5 && s[1..6] == "apos;")
                                   {
                                   *d++ = '\'';
                                   token = 6;
                                   }
                             break;

                        case 'g':
                             if (len > 3 && s[1..4] == "gt;")
                                {
                                *d++ = '>';
                                token = 4;
                                }
                             break;

                        case 'l':
                             if (len > 3 && s[1..4] == "lt;")
                                {
                                *d++ = '<';
                                token = 4;
                                }
                             break;

                        case 'q':
                             if (len > 5 && s[1..6] == "quot;")
                                {
                                *d++ = '"';
                                token = 6;
                                }
                             break;

                        default:
                             break;
                        }
                 }

              // not a known entity: pass the '&' through as-is
              if (token is 0)
                 {
                 *d++ = '&';
                 token = 1;
                 }

              s += token;
              len -= token;
              } while ((delta = Util.indexOf (s, '&', len)) < len);

           // copy tail too
           d [0 .. len] = s [0 .. len];
           return dst [0 .. (d + len) - dst.ptr];
           }

        // nothing to translate: hand back a plain copy of the input
        if(dst.length < src.length)
            dst.length = src.length;
        dst[0..src.length] = src;
        return dst[0..src.length];
}


/******************************************************************************

        Convert XML entity patterns to normal characters
        ---
        &amp;  => &
        &quot; => "
        &apos; => '
        &lt;   => <
        &gt;   => >
        ---

        This variant does not require an interim workspace, and instead
        emits directly via the provided delegate

        Only the five patterns listed above are translated. Any other
        '&' (including one at the very end of the input) is emitted
        unchanged.

        Params:
        src  = the text to decode
        emit = called, in order, with each untranslated segment of src
               and with each decoded character. Segments may be empty.
               When src holds no '&' at all, emit is called exactly
               once with src itself

******************************************************************************/

void fromEntity (T) (const(T)[] src, scope void delegate(const(T)[]) emit)
{
        ptrdiff_t delta;
        auto s = src.ptr;
        auto len = src.length;

        // take a peek first to see if there's anything
        if ((delta = Util.indexOf (s, '&', len)) < len)
           {
           // copy segments over, a chunk at a time
           do {
              emit (s [0 .. delta]);
              len -= delta;
              s += delta;

              // number of source chars consumed by a recognised entity
              auto token = 0;

              // s[0] is the '&' here. Only inspect s[1] when it exists:
              // pointer indexing is unchecked, so a trailing '&' would
              // otherwise read past the end of src
              if (len > 1)
                 {
                 switch (s[1])
                        {
                        case 'a':
                             if (len > 4 && s[1..5] == "amp;")
                                {
                                emit ("&");
                                token = 5;
                                }
                             else
                                if (len > 5 && s[1..6] == "apos;")
                                   {
                                   emit ("'");
                                   token = 6;
                                   }
                             break;

                        case 'g':
                             if (len > 3 && s[1..4] == "gt;")
                                {
                                emit (">");
                                token = 4;
                                }
                             break;

                        case 'l':
                             if (len > 3 && s[1..4] == "lt;")
                                {
                                emit ("<");
                                token = 4;
                                }
                             break;

                        case 'q':
                             if (len > 5 && s[1..6] == "quot;")
                                {
                                emit ("\"");
                                token = 6;
                                }
                             break;

                        default:
                             break;
                        }
                 }

              // not a known entity: pass the '&' through as-is
              if (token is 0)
                 {
                 emit ("&");
                 token = 1;
                 }

              s += token;
              len -= token;
              } while ((delta = Util.indexOf (s, '&', len)) < len);

           // copy tail too
           emit (s [0 .. len]);
           }
        else
           emit (src);
}


/******************************************************************************

        Convert reserved chars to entities. For example: " => &quot;

        The characters translated are: " > < & and '

        A slice of the provided output buffer is returned. The output
        buffer should be sufficiently large to accommodate the converted
        output, or it will be allocated from the heap instead

        Params:
        src = the text to encode
        dst = optional output buffer; resized whenever it is too short
              for the pending output

        Returns:
        A slice of dst holding the encoded text. The content is always
        copied into dst, including when src holds no reserved chars.

******************************************************************************/

T[] toEntity(T) (const(T)[] src, T[] dst = null)
{
        const(T)[]  entity;
        auto s = src.ptr;               // scan position
        auto t = s;                     // start of the pending plain run
        auto e = s + src.length;

        // size_t rather than int, so that accumulating lengths cannot
        // be silently truncated on large output
        size_t index = 0;

        while (s < e)
              {
              switch (*s)
                     {
                     case '"':
                          entity = "&quot;";
                          break;

                     case '>':
                          entity = "&gt;";
                          break;

                     case '<':
                          entity = "&lt;";
                          break;

                     case '&':
                          entity = "&amp;";
                          break;

                     case '\'':
                          entity = "&apos;";
                          break;

                     default:
                          // ordinary char: leave it in the pending run
                          ++s;
                          continue;
                     }

              // flush the plain run preceding this char, then the entity
              size_t len = s - t;
              if (dst.length < index + len + entity.length)
                  // grow by what is needed plus half again, to limit
                  // the number of reallocations
                  dst.length = (dst.length + len + entity.length) + dst.length / 2;

              dst [index .. index + len] = t [0 .. len];
              index += len;

              dst [index .. index + entity.length] = entity;
              index += entity.length;
              t = ++s;
              }

        // copy tail too. When nothing was translated, index is zero and
        // the tail is the whole of src, so this is a plain copy
        size_t tail = e - t;
        if (dst.length < index + tail)
            dst.length = index + tail;

        dst [index .. index + tail] = t [0 .. tail];
        return dst [0 .. index + tail];
}


/******************************************************************************

        Replace each reserved char with its entity: " => &quot; etc.

        The chars replaced are: " > < & and '

        Nothing is buffered here. Output is handed to the provided
        delegate piece by piece: each non-empty run of ordinary chars,
        then the entity that follows it. After at least one replacement
        the remaining tail is emitted (it may be empty); with no
        replacement at all, src is emitted once, whole.

******************************************************************************/

void toEntity(T) (const(T)[] src, scope void delegate(const(T)[]) emit)
{
        size_t pending  = 0;            // start of the not-yet-emitted run
        bool   replaced = false;        // was any entity emitted?

        foreach (pos, ch; src)
                {
                const(T)[] substitute;

                switch (ch)
                       {
                       case '\'':
                            substitute = "&apos;";
                            break;

                       case '&':
                            substitute = "&amp;";
                            break;

                       case '<':
                            substitute = "&lt;";
                            break;

                       case '>':
                            substitute = "&gt;";
                            break;

                       case '"':
                            substitute = "&quot;";
                            break;

                       default:
                            // ordinary char: stays in the pending run
                            continue;
                       }

                // flush what precedes the reserved char, if anything
                if (pos > pending)
                    emit (src [pending .. pos]);

                emit (substitute);
                pending  = pos + 1;
                replaced = true;
                }

        // tail after the last entity, or the untouched input
        if (replaced)
            emit (src [pending .. $]);
        else
           emit (src);
}



/*******************************************************************************

*******************************************************************************/

debug (DocEntity)
{
        import tango.io.Console;

        // Self-test, compiled only with -debug=DocEntity
        void main()
        {
                // [encoded input, expected decoded text], checked in order
                string[2][] decoding =
                [
                        ["&amp;",               "&"],
                        ["&quot;",              "\""],
                        ["&apos;",              "'"],
                        ["&gt;",                ">"],
                        ["&lt;",                "<"],
                        ["&lt;&amp;&apos;",     "<&'"],
                        ["*&lt;&amp;&apos;*",   "*<&'*"],
                        ["abc",                 "abc"],
                        ["abc&",                "abc&"],
                        ["abc&lt;",             "abc<"],
                        ["abc&gt;goo",          "abc>goo"],
                        ["&amp;",               "&"],
                        ["&quot;&apos;",        "\"'"],
                        ["&q&s",                "&q&s"]
                ];

                foreach (pair; decoding)
                         assert (fromEntity (pair[0]) == pair[1]);

                // [plain input, expected encoded text], checked in order
                string[2][] encoding =
                [
                        [">",           "&gt;"],
                        ["<",           "&lt;"],
                        ["&",           "&amp;"],
                        ["'",           "&apos;"],
                        ["\"",          "&quot;"],
                        ["^^>*>*",      "^^&gt;*&gt;*"]
                ];

                foreach (pair; encoding)
                         assert (toEntity (pair[0]) == pair[1]);
        }
}