// tango.io.UnicodeFile
/*******************************************************************************

        copyright:      Copyright (c) 2005 Kris Bell. All rights reserved

        license:        BSD style: $(LICENSE)

        version:        Initial release: December 2005

        author:         Kris

*******************************************************************************/

module tango.io.UnicodeFile;

private import tango.io.device.File;

public  import tango.text.convert.UnicodeBom;

/*******************************************************************************

        Read and write Unicode files

        For our purposes, Unicode files are an encoding of textual material.
        The goal of this module is to interface that external-encoding with
        a programmer-defined internal-encoding. This internal encoding is
        declared via the template argument T, whilst the external encoding
        is either specified or derived.

        Three internal encodings are supported: char, wchar, and dchar. The
        methods herein operate upon arrays of this type. For example, read()
        returns an array of the type, whilst write() and append() expect an
        array of said type.

        Supported external encodings are as follows:

        $(UL
          $(LI Encoding.Unknown)
          $(LI Encoding.UTF_8)
          $(LI Encoding.UTF_8N)
          $(LI Encoding.UTF_16)
          $(LI Encoding.UTF_16BE)
          $(LI Encoding.UTF_16LE)
          $(LI Encoding.UTF_32)
          $(LI Encoding.UTF_32BE)
          $(LI Encoding.UTF_32LE))

        These can be divided into implicit and explicit encodings. Here is
        the implicit subset:

        $(UL
          $(LI Encoding.Unknown)
          $(LI Encoding.UTF_8)
          $(LI Encoding.UTF_16)
          $(LI Encoding.UTF_32))

        Implicit encodings may be used to 'discover'
        an unknown encoding, by examining the first few bytes of the file
        content for a signature. This signature is optional for all files,
        but is often written such that the content is self-describing. When
        the encoding is unknown, using one of the non-explicit encodings will
        cause the read() method to look for a signature and adjust itself
        accordingly. It is possible that a ZWNBSP character might be confused
        with the signature; today's files are supposed to use the WORD-JOINER
        character instead.

        Explicit encodings are as follows:

        $(UL
          $(LI Encoding.UTF_8N)
          $(LI Encoding.UTF_16BE)
          $(LI Encoding.UTF_16LE)
          $(LI Encoding.UTF_32BE)
          $(LI Encoding.UTF_32LE))

        This group of encodings is for use when the file encoding is
        known. These *must* be used when writing or appending, since written
        content must be in a known format. It should be noted that, during a
        read operation, the presence of a signature is in conflict with these
        explicit varieties.

        Method read() returns the current content of the file, whilst write()
        sets the file content, and file length, to the provided array. Method
        append() adds content to the tail of the file. When appending, it is
        your responsibility to ensure the existing and current encodings are
        correctly matched.

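        Note that this class wraps a plain path string and has no FilePath
        superclass: methods to inspect the file system, or to check the
        status of a file or directory, are not provided here. The path is
        available via toString() for use with other facilities.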

        See these links for more info:
      $(UL
        $(LI $(LINK http://www.utf-8.com/))
        $(LI $(LINK http://www.hackcraft.net/xmlUnicode/))
        $(LI $(LINK http://www.unicode.org/faq/utf_bom.html))
        $(LI $(LINK http://www.azillionmonkeys.com/qed/unicode.html))
        $(LI $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/)))

*******************************************************************************/

class UnicodeFile(T)
{
        private UnicodeBom!(T)  bom_;
        private const(char)[]   path_;

        /***********************************************************************

                Create a UnicodeFile bound to the file at the given path.

                Params:
                path     = location of the file to be read, written or
                           appended to
                encoding = the external file encoding; should be one of
                           the Encoding.* types

        ***********************************************************************/

        this (const(char)[] path, Encoding encoding)
        {
                path_ = path;
                bom_  = new UnicodeBom!(T)(encoding);
        }

        /***********************************************************************

                Factory shortcut: lets a call site write UnicodeFile!(T)(...)
                without 'new', mirroring struct construction syntax.

                Params:
                name     = path of the file
                encoding = the external file encoding

                Returns: a newly allocated UnicodeFile instance

        ***********************************************************************/

        static UnicodeFile opCall (const(char)[] name, Encoding encoding)
        {
                auto instance = new UnicodeFile (name, encoding);
                return instance;
        }

        /***********************************************************************

                Returns: an immutable copy of the path this instance was
                constructed with.

        ***********************************************************************/

        override immutable(char)[] toString ()
        {
                immutable(char)[] copy = path_.idup;
                return copy;
        }

        /***********************************************************************

                Returns: the encoding currently held by the bom instance.
                This is either the encoding given at construction, or one
                derived from a bom found in the file content; the latter
                is established as part of read().

        ***********************************************************************/

        Encoding encoding ()
        {
                return bom_.encoding;
        }

        /***********************************************************************

                Returns: the UnicodeBom instance used for all conversions.
                Query it for further detail on the encoding status.

        ***********************************************************************/

        UnicodeBom!(T) bom ()
        {
                return bom_;
        }

        /***********************************************************************

                Load the entire file and decode it into an array of T.

                The content is inspected for a bom signature, which is
                stripped. An exception is thrown if a signature is present
                when, according to the encoding type, it should not be.
                Conversely, an exception is thrown if there is no known
                signature where the current encoding expects one.

                Returns: the decoded file content

        ***********************************************************************/

        final T[] read ()
        {
                return bom_.decode (File.get (path_));
        }

        /***********************************************************************

                Replace the file content, and length, with the encoded
                form of the given array.

                Params:
                content  = the text to store
                writeBom = when true, the signature of the current
                           encoding is written ahead of the content

        ***********************************************************************/

        final void write (const(T)[] content, bool writeBom)
        {
                // encode first (this may throw an exception), so that the
                // file is left untouched when the conversion fails
                void[] encoded = bom_.encode (content.dup);

                scope output = new File (path_, File.ReadWriteCreate);
                try {
                    // optional signature precedes the payload
                    if (writeBom)
                        output.write (bom_.signature);

                    output.write (encoded);
                    }
                finally
                    {
                    output.close();
                    }
        }

        /***********************************************************************

                Encode the given array and add it to the tail of the file.

                Note that it is your responsibility to ensure the
                existing and current encodings are correctly matched.

                Params:
                content = the text to append

        ***********************************************************************/

        final void append (const(T)[] content)
        {
                // convert to external representation (may throw an exception)
                auto encoded = bom_.encode (content.dup);
                File.append (path_, encoded);
        }
}


/*******************************************************************************

*******************************************************************************/

debug (UnicodeFile)
{
        import tango.io.Stdout;

        // Demo entry point: reads "UnicodeFile.d" as UTF-8 text and
        // prints the decoded content followed by a newline.
        void main()
        {
                auto source = UnicodeFile!(char)("UnicodeFile.d", Encoding.UTF_8);
                Stdout (source.read()).newline;
        }
}