src/nom/tam/util/ByteParser.java

   1 package nom.tam.util;\r
   2 \r
   3 import java.io.UnsupportedEncodingException;\r
   4 import java.util.logging.Level;\r
   5 import java.util.logging.Logger;\r
   6 \r
   7 /** This class provides routines\r
   8  *  for efficient parsing of data stored in a byte array.\r
   9  *  This routine is optimized (in theory at least!) for efficiency\r
  10  *  rather than accuracy.  The values read in for doubles or floats\r
  11  *  may differ in the last bit or so from the standard input\r
  12  *  utilities, especially in the case where a float is specified\r
  13  * as a very long string of digits (substantially longer than\r
  14  * the precision of the type).\r
  15  * <p>\r
  16  * The get methods generally are available with or without a length\r
  17  * parameter specified.  When a length parameter is specified only\r
  18  * the bytes with the specified range from the current offset will\r
  19  * be search for the number.  If no length is specified, the entire\r
  20  * buffer from the current offset will be searched.\r
  21  * <p>\r
  22  * The getString method returns a string with leading and trailing\r
  23  * white space left intact.  For all other get calls, leading\r
  24  * white space is ignored.  If fillFields is set, then the get\r
  25  * methods check that only white space follows valid data and a\r
  26  * FormatException is thrown if that is not the case.  If\r
  27  * fillFields is not set and valid data is found, then the\r
  28  * methods return having read as much as possible.  E.g., for\r
  29  * the sequence "T123.258E13", a getBoolean, getInteger and\r
  30  * getFloat call would return true, 123, and 2.58e12 when\r
  31  * called in succession.\r
  32  *\r
  33  */\r
  34 public class ByteParser {\r
  35 \r
  36     /** Array being parsed */\r
  37     private byte[] input;\r
  38     /** Current offset into input. */\r
  39     private int offset;\r
  40     /** Length of last parsed value */\r
  41     private int numberLength;\r
  42     /** Did we find a sign last time we checked? */\r
  43     private boolean foundSign;\r
  44     /** Do we fill up fields? */\r
  45     private boolean fillFields = false;\r
  46 \r
  47     /** Construct a parser.\r
  48      * @param input     The byte array to be parsed.\r
  49      *                 Note that the array can be re-used by\r
  50      *                 refilling its contents and resetting the offset.\r
  51      */\r
  52     public ByteParser(byte[] input) {\r
  53         this.input = input;\r
  54         this.offset = 0;\r
  55     }\r
  56 \r
  57     /** Set the buffer for the parser */\r
  58     public void setBuffer(byte[] buf) {\r
  59         this.input = buf;\r
  60         this.offset = 0;\r
  61     }\r
  62 \r
  63     /** Get the buffer being used by the parser */\r
  64     public byte[] getBuffer() {\r
  65         return input;\r
  66     }\r
  67 \r
  68     /** Set the offset into the array.\r
  69      * @param offset    The desired offset from the beginning\r
  70      *                  of the array.\r
  71      */\r
  72     public void setOffset(int offset) {\r
  73         this.offset = offset;\r
  74     }\r
  75 \r
  76     /** Do we require a field to completely fill up the specified\r
  77      * length (with optional leading and trailing white space.\r
  78     @param flag Is filling required?\r
  79      */\r
  80     public void setFillFields(boolean flag) {\r
  81         fillFields = flag;\r
  82     }\r
  83 \r
  84     /** Get the current offset\r
  85     @return The current offset within the buffer.\r
  86      */\r
  87     public int getOffset() {\r
  88         return offset;\r
  89     }\r
  90 \r
  91     /** Get the number of characters used to parse the previous\r
  92      *  number (or the length of the previous String returned).\r
  93      */\r
  94     public int getNumberLength() {\r
  95         return numberLength;\r
  96     }\r
  97 \r
  98     /** Read in the buffer until a double is read.  This will read\r
  99      * the entire buffer if fillFields is set.\r
 100      * @return The value found.\r
 101      */\r
 102     public double getDouble() throws FormatException {\r
 103         return getDouble(input.length - offset);\r
 104     }\r
 105 \r
 106     /** Look for a double in the buffer.\r
 107      * Leading spaces are ignored.\r
 108      * @param length    The maximum number of characters\r
 109      *                  used to parse this number.  If fillFields\r
 110      *                  is specified then exactly only whitespace may follow\r
 111      *                  a valid double value.\r
 112      */\r
 113     public double getDouble(int length) throws FormatException {\r
 114 \r
 115         int startOffset = offset;\r
 116 \r
 117         boolean error = true;\r
 118 \r
 119         double number = 0;\r
 120         int i = 0;\r
 121 \r
 122         // Skip initial blanks.\r
 123         length -= skipWhite(length);\r
 124 \r
 125         if (length == 0) {\r
 126             numberLength = offset - startOffset;\r
 127             return 0;\r
 128         }\r
 129 \r
 130         double mantissaSign = checkSign();\r
 131         if (foundSign) {\r
 132             length -= 1;\r
 133         }\r
 134 \r
 135         // Look for the special strings NaN, Inf,\r
 136         if (length >= 3\r
 137                 && (input[offset] == 'n' || input[offset] == 'N')\r
 138                 && (input[offset + 1] == 'a' || input[offset + 1] == 'A')\r
 139                 && (input[offset + 2] == 'n' || input[offset + 2] == 'N')) {\r
 140 \r
 141             number = Double.NaN;\r
 142             length -= 3;\r
 143             offset += 3;\r
 144 \r
 145             // Look for the longer string first then try the shorter.\r
 146         } else if (length >= 8\r
 147                 && (input[offset] == 'i' || input[offset] == 'I')\r
 148                 && (input[offset + 1] == 'n' || input[offset + 1] == 'N')\r
 149                 && (input[offset + 2] == 'f' || input[offset + 2] == 'F')\r
 150                 && (input[offset + 3] == 'i' || input[offset + 3] == 'I')\r
 151                 && (input[offset + 4] == 'n' || input[offset + 4] == 'N')\r
 152                 && (input[offset + 5] == 'i' || input[offset + 5] == 'I')\r
 153                 && (input[offset + 6] == 't' || input[offset + 6] == 'T')\r
 154                 && (input[offset + 7] == 'y' || input[offset + 7] == 'Y')) {\r
 155             number = Double.POSITIVE_INFINITY;\r
 156             length -= 8;\r
 157             offset += 8;\r
 158 \r
 159         } else if (length >= 3\r
 160                 && (input[offset] == 'i' || input[offset] == 'I')\r
 161                 && (input[offset + 1] == 'n' || input[offset + 1] == 'N')\r
 162                 && (input[offset + 2] == 'f' || input[offset + 2] == 'F')) {\r
 163             number = Double.POSITIVE_INFINITY;\r
 164             length -= 3;\r
 165             offset += 3;\r
 166 \r
 167         } else {\r
 168 \r
 169             number = getBareInteger(length);   // This will update offset\r
 170             length -= numberLength;            // Set by getBareInteger\r
 171 \r
 172             if (numberLength > 0) {\r
 173                 error = false;\r
 174             }\r
 175 \r
 176             // Check for fractional values after decimal\r
 177             if (length > 0 && input[offset] == '.') {\r
 178 \r
 179                 offset += 1;\r
 180                 length -= 1;\r
 181 \r
 182                 double numerator = getBareInteger(length);\r
 183                 if (numerator > 0) {\r
 184                     number += numerator / Math.pow(10., numberLength);\r
 185                 }\r
 186                 length -= numberLength;\r
 187                 if (numberLength > 0) {\r
 188                     error = false;\r
 189                 }\r
 190             }\r
 191 \r
 192             if (error) {\r
 193                 offset = startOffset;\r
 194                 numberLength = 0;\r
 195                 throw new FormatException("Invalid real field");\r
 196             }\r
 197 \r
 198             // Look for an exponent\r
 199             if (length > 0) {\r
 200 \r
 201                 // Our Fortran heritage means that we allow 'D' for the exponent indicator.\r
 202                 if (input[offset] == 'e' || input[offset] == 'E'\r
 203                         || input[offset] == 'd' || input[offset] == 'D') {\r
 204 \r
 205                     offset += 1;\r
 206                     length -= 1;\r
 207                     if (length > 0) {\r
 208                         int sign = checkSign();\r
 209                         if (foundSign) {\r
 210                             length -= 1;\r
 211                         }\r
 212 \r
 213                         int exponent = (int) getBareInteger(length);\r
 214 \r
 215                         // For very small numbers we try to miminize\r
 216                         // effects of denormalization.\r
 217                         if (exponent * sign > -300) {\r
 218                             number *= Math.pow(10., exponent * sign);\r
 219                         } else {\r
 220                             number = 1.e-300 * (number * Math.pow(10., exponent * sign + 300));\r
 221                         }\r
 222                         length -= numberLength;\r
 223                     }\r
 224                 }\r
 225             }\r
 226         }\r
 227 \r
 228         if (fillFields && length > 0) {\r
 229 \r
 230             if (isWhite(length)) {\r
 231                 offset += length;\r
 232             } else {\r
 233                 numberLength = 0;\r
 234                 offset = startOffset;\r
 235                 throw new FormatException("Non-blanks following real.");\r
 236             }\r
 237         }\r
 238 \r
 239         numberLength = offset - startOffset;\r
 240         return mantissaSign * number;\r
 241     }\r
 242 \r
 243     /** Get a floating point value from the buffer.  (see getDouble(int())\r
 244      */\r
 245     public float getFloat() throws FormatException {\r
 246         return (float) getDouble(input.length - offset);\r
 247     }\r
 248 \r
 249     /** Get a floating point value in a region of the buffer */\r
 250     public float getFloat(int length) throws FormatException {\r
 251         return (float) getDouble(length);\r
 252     }\r
 253 \r
 254     /** Convert a region of the buffer to an integer */\r
 255     public int getInt(int length) throws FormatException {\r
 256         int startOffset = offset;\r
 257 \r
 258         length -= skipWhite(length);\r
 259         if (length == 0) {\r
 260             numberLength = offset - startOffset;\r
 261             return 0;\r
 262         }\r
 263 \r
 264         int number = 0;\r
 265         boolean error = true;\r
 266 \r
 267         int sign = checkSign();\r
 268         if (foundSign) {\r
 269             length -= 1;\r
 270         }\r
 271 \r
 272         while (length > 0 && input[offset] >= '0' && input[offset] <= '9') {\r
 273             number = number * 10 + input[offset] - '0';\r
 274             offset += 1;\r
 275             length -= 1;\r
 276             error = false;\r
 277         }\r
 278 \r
 279         if (error) {\r
 280             numberLength = 0;\r
 281             offset = startOffset;\r
 282             throw new FormatException("Invalid Integer");\r
 283         }\r
 284 \r
 285         if (length > 0 && fillFields) {\r
 286             if (isWhite(length)) {\r
 287                 offset += length;\r
 288             } else {\r
 289                 numberLength = 0;\r
 290                 offset = startOffset;\r
 291                 throw new FormatException("Non-white following integer");\r
 292             }\r
 293         }\r
 294 \r
 295         numberLength = offset - startOffset;\r
 296         return sign * number;\r
 297     }\r
 298 \r
 299     /** Look for an integer at the beginning of the buffer */\r
 300     public int getInt() throws FormatException {\r
 301         return getInt(input.length - offset);\r
 302     }\r
 303 \r
 304     /** Look for a long in a specified region of the buffer */\r
 305     public long getLong(int length) throws FormatException {\r
 306 \r
 307         int startOffset = offset;\r
 308 \r
 309         // Skip white space.\r
 310         length -= skipWhite(length);\r
 311         if (length == 0) {\r
 312             numberLength = offset - startOffset;\r
 313             return 0;\r
 314         }\r
 315 \r
 316         long number = 0;\r
 317         boolean error = true;\r
 318 \r
 319         long sign = checkSign();\r
 320         if (foundSign) {\r
 321             length -= 1;\r
 322         }\r
 323 \r
 324         while (length > 0 && input[offset] >= '0' && input[offset] <= '9') {\r
 325             number = number * 10 + input[offset] - '0';\r
 326             error = false;\r
 327             offset += 1;\r
 328             length -= 1;\r
 329         }\r
 330 \r
 331         if (error) {\r
 332             numberLength = 0;\r
 333             offset = startOffset;\r
 334             throw new FormatException("Invalid long number");\r
 335         }\r
 336 \r
 337         if (length > 0 && fillFields) {\r
 338             if (isWhite(length)) {\r
 339                 offset += length;\r
 340             } else {\r
 341                 offset = startOffset;\r
 342                 numberLength = 0;\r
 343                 throw new FormatException("Non-white following long");\r
 344             }\r
 345         }\r
 346         numberLength = offset - startOffset;\r
 347         return sign * number;\r
 348     }\r
 349 \r
 350     /** Get a string\r
 351      * @param length  The length of the string.\r
 352      */\r
 353     public String getString(int length) {\r
 354 \r
 355         String s = AsciiFuncs.asciiString(input, offset, length);\r
 356         offset += length;\r
 357         numberLength = length;\r
 358         return s;\r
 359     }\r
 360 \r
 361     /** Get a boolean value from the beginning of the buffer */\r
 362     public boolean getBoolean() throws FormatException {\r
 363         return getBoolean(input.length - offset);\r
 364     }\r
 365 \r
 366     /** Get a boolean value from a specified region of the buffer */\r
 367     public boolean getBoolean(int length) throws FormatException {\r
 368 \r
 369         int startOffset = offset;\r
 370         length -= skipWhite(length);\r
 371         if (length == 0) {\r
 372             throw new FormatException("Blank boolean field");\r
 373         }\r
 374 \r
 375         boolean value = false;\r
 376         if (input[offset] == 'T' || input[offset] == 't') {\r
 377             value = true;\r
 378         } else if (input[offset] != 'F' && input[offset] != 'f') {\r
 379             numberLength = 0;\r
 380             offset = startOffset;\r
 381             throw new FormatException("Invalid boolean value");\r
 382         }\r
 383         offset += 1;\r
 384         length -= 1;\r
 385 \r
 386         if (fillFields && length > 0) {\r
 387             if (isWhite(length)) {\r
 388                 offset += length;\r
 389             } else {\r
 390                 numberLength = 0;\r
 391                 offset = startOffset;\r
 392                 throw new FormatException("Non-white following boolean");\r
 393             }\r
 394         }\r
 395         numberLength = offset - startOffset;\r
 396         return value;\r
 397     }\r
 398 \r
 399     /** Skip bytes in the buffer */\r
 400     public void skip(int nBytes) {\r
 401         offset += nBytes;\r
 402     }\r
 403 \r
 404     /** Get the integer value starting at the current position.\r
 405      * This routine returns a double rather than an int/long\r
 406      * to enable it to read very long integers (with reduced\r
 407      * precision) such as 111111111111111111111111111111111111111111.\r
 408      * Note that this routine does set numberLength.\r
 409      *\r
 410      * @param length    The maximum number of characters to use.\r
 411      */\r
 412     private double getBareInteger(int length) {\r
 413 \r
 414         int startOffset = offset;\r
 415         double number = 0;\r
 416 \r
 417         while (length > 0 && input[offset] >= '0' && input[offset] <= '9') {\r
 418 \r
 419             number *= 10;\r
 420             number += input[offset] - '0';\r
 421             offset += 1;\r
 422             length -= 1;\r
 423         }\r
 424         numberLength = offset - startOffset;\r
 425         return number;\r
 426     }\r
 427 \r
 428     /** Skip white space.  This routine skips with space in\r
 429      * the input and returns the number of character skipped.\r
 430      * White space is defined as ' ', '\t', '\n' or '\r'\r
 431      *\r
 432      * @param length The maximum number of characters to skip.\r
 433      */\r
 434     public int skipWhite(int length) {\r
 435 \r
 436         int i;\r
 437         for (i = 0; i < length; i += 1) {\r
 438             if (input[offset + i] != ' ' && input[offset + i] != '\t'\r
 439                     && input[offset + i] != '\n' && input[offset + i] != '\r') {\r
 440                 break;\r
 441             }\r
 442         }\r
 443 \r
 444         offset += i;\r
 445         return i;\r
 446 \r
 447     }\r
 448 \r
 449     /** Find the sign for a number .\r
 450      * This routine looks for a sign (+/-) at the current location\r
 451      * and return +1/-1 if one is found, or +1 if not.\r
 452      * The foundSign boolean is set if a sign is found and offset is\r
 453      * incremented.\r
 454      */\r
 455     private int checkSign() {\r
 456 \r
 457         foundSign = false;\r
 458 \r
 459         if (input[offset] == '+') {\r
 460             foundSign = true;\r
 461             offset += 1;\r
 462             return 1;\r
 463         } else if (input[offset] == '-') {\r
 464             foundSign = true;\r
 465             offset += 1;\r
 466             return -1;\r
 467         }\r
 468 \r
 469         return 1;\r
 470     }\r
 471 \r
 472     /** Is a region blank?\r
 473      * @param length The length of the region to be tested\r
 474      */\r
 475     private boolean isWhite(int length) {\r
 476         int oldOffset = offset;\r
 477         boolean value = skipWhite(length) == length;\r
 478         offset = oldOffset;\r
 479         return value;\r
 480     }\r
 481 }\r