001/* ***** BEGIN LICENSE BLOCK ***** 002 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 003 * 004 * The contents of this file are subject to the Mozilla Public License Version 005 * 1.1 (the "License"); you may not use this file except in compliance with 006 * the License. You may obtain a copy of the License at 007 * http://www.mozilla.org/MPL/ 008 * 009 * Software distributed under the License is distributed on an "AS IS" basis, 010 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 011 * for the specific language governing rights and limitations under the 012 * License. 013 * 014 * The Original Code is part of dcm4che, an implementation of DICOM(TM) in 015 * Java(TM), hosted at http://sourceforge.net/projects/dcm4che. 016 * 017 * The Initial Developer of the Original Code is 018 * Gunter Zeilinger, Huetteldorferstr. 24/10, 1150 Vienna/Austria/Europe. 019 * Portions created by the Initial Developer are Copyright (C) 2010 020 * the Initial Developer. All Rights Reserved. 021 * 022 * Contributor(s): 023 * Gunter Zeilinger <gunterze@gmail.com> 024 * 025 * Alternatively, the contents of this file may be used under the terms of 026 * either the GNU General Public License Version 2 or later (the "GPL"), or 027 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 028 * in which case the provisions of the GPL or the LGPL are applicable instead 029 * of those above. If you wish to allow use of your version of this file only 030 * under the terms of either the GPL or the LGPL, and not to allow others to 031 * use your version of this file under the terms of the MPL, indicate your 032 * decision by deleting the provisions above and replace them with the notice 033 * and other provisions required by the GPL or the LGPL. If you do not delete 034 * the provisions above, a recipient may use your version of this file under 035 * the terms of any one of the MPL, the GPL or the LGPL. 
036 * 037 * ***** END LICENSE BLOCK ***** */ 038 039/* ***** BEGIN LICENSE BLOCK ***** 040 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 041 * 042 * The contents of this file are subject to the Mozilla Public License Version 043 * 1.1 (the "License"); you may not use this file except in compliance with 044 * the License. You may obtain a copy of the License at 045 * http://www.mozilla.org/MPL/ 046 * 047 * Software distributed under the License is distributed on an "AS IS" basis, 048 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 049 * for the specific language governing rights and limitations under the 050 * License. 051 * 052 * The Original Code is part of dcm4che, an implementation of DICOM(TM) in 053 * Java(TM), hosted at https://github.com/gunterze/dcm4che. 054 * 055 * The Initial Developer of the Original Code is 056 * Agfa Healthcare. 057 * Portions created by the Initial Developer are Copyright (C) 2011 058 * the Initial Developer. All Rights Reserved. 059 * 060 * Contributor(s): 061 * See @authors listed below 062 * 063 * Alternatively, the contents of this file may be used under the terms of 064 * either the GNU General Public License Version 2 or later (the "GPL"), or 065 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 066 * in which case the provisions of the GPL or the LGPL are applicable instead 067 * of those above. If you wish to allow use of your version of this file only 068 * under the terms of either the GPL or the LGPL, and not to allow others to 069 * use your version of this file under the terms of the MPL, indicate your 070 * decision by deleting the provisions above and replace them with the notice 071 * and other provisions required by the GPL or the LGPL. If you do not delete 072 * the provisions above, a recipient may use your version of this file under 073 * the terms of any one of the MPL, the GPL or the LGPL. 
/**
 * Converts DICOM string values between Java strings and bytes according to
 * the codes of attribute Specific Character Set (0008,0005).
 *
 * <p>An instance created for a single code encodes and decodes with exactly
 * one charset. An instance created for several codes (see
 * {@link #valueOf(String...)}) additionally understands the escape sequences
 * that switch between the listed charsets inside one value.
 *
 * @author Gunter Zeilinger <gunterze@gmail.com>
 */
public class SpecificCharacterSet {

    /** Character set whose only codec is US-ASCII. */
    public static final SpecificCharacterSet ASCII =
            new SpecificCharacterSet(new Codec[] { Codec.ISO_646 });

    /** Character set returned by {@link #valueOf(String...)} when no code is given. */
    public static SpecificCharacterSet DEFAULT = ASCII;

    // Per-thread caches for the encoders of the first and second codec used
    // by ISO2022.encode; soft references allow them to be reclaimed.
    private static final ThreadLocal<SoftReference<Encoder>> cachedEncoder1 =
            new ThreadLocal<SoftReference<Encoder>>();
    private static final ThreadLocal<SoftReference<Encoder>> cachedEncoder2 =
            new ThreadLocal<SoftReference<Encoder>>();

    /** Codecs in the order of the codes passed to the constructor. */
    protected final Codec[] codecs;

    /** The codes this instance was created from, as passed to the constructor. */
    protected final String[] dicomCodes;

    /**
     * One supported charset together with the two-to-three byte designators
     * (the bytes following ESC) that select it in slot 0 (G0) or slot 1 (G1)
     * of {@link ISO2022}.
     */
    private enum Codec {
        ISO_646("US-ASCII", true, 0x2842, 0, 1),
        ISO_8859_1("ISO-8859-1", true, 0x2842, 0x2d41, 1),
        ISO_8859_2("ISO-8859-2", true, 0x2842, 0x2d42, 1),
        ISO_8859_3("ISO-8859-3", true, 0x2842, 0x2d43, 1),
        ISO_8859_4("ISO-8859-4", true, 0x2842, 0x2d44, 1),
        ISO_8859_5("ISO-8859-5", true, 0x2842, 0x2d4c, 1),
        ISO_8859_6("ISO-8859-6", true, 0x2842, 0x2d47, 1),
        ISO_8859_7("ISO-8859-7", true, 0x2842, 0x2d46, 1),
        ISO_8859_8("ISO-8859-8", true, 0x2842, 0x2d48, 1),
        ISO_8859_9("ISO-8859-9", true, 0x2842, 0x2d4d, 1),
        JIS_X_201("JIS_X0201", true, 0x284a, 0x2949, 1) {
            @Override
            public String toText(String s) {
                // U+00A5 (yen sign) written as an escape so the result does
                // not depend on the encoding the source file is compiled with
                return s.replace('\\', '\u00a5');
            }
        },
        TIS_620("TIS-620", true, 0x2842, 0x2d54, 1),
        JIS_X_208("x-JIS0208", false, 0x2442, 0, 1),
        JIS_X_212("JIS_X0212-1990", false, 0x242844, 0, 2),
        KS_X_1001("EUC-KR", false, 0x2842, 0x242943, -1),
        GB2312("GB2312", false, 0x2842, 0x242941, -1),
        UTF_8("UTF-8", true, 0, 0, -1),
        GB18030("GB18030", false, 0, 0, -1);

        private final String charsetName;
        private final boolean containsASCII;
        private final int escSeq0;
        private final int escSeq1;
        private final int bytesPerChar;

        /**
         * @param charsetName   Java charset name used for encoding/decoding
         * @param containsASCII value reported by {@link #containsASCII()}
         * @param escSeq0       designator bytes for slot 0 (G0), 0 if none
         * @param escSeq1       designator bytes for slot 1 (G1), 0 if none
         * @param bytesPerChar  fixed step width used by {@code ISO2022.decode};
         *                      a non-positive value means "2 bytes if the high
         *                      bit of the first byte is set, otherwise 1"
         */
        Codec(String charsetName, boolean containsASCII, int escSeq0, int escSeq1,
                int bytesPerChar) {
            this.charsetName = charsetName;
            this.containsASCII = containsASCII;
            this.escSeq0 = escSeq0;
            this.escSeq1 = escSeq1;
            this.bytesPerChar = bytesPerChar;
        }

        /**
         * Maps a Specific Character Set code to its codec.
         *
         * @param code code value, may be {@code null}
         * @return the matching codec; {@link #ISO_646} for {@code null} and
         *         for any code that is not recognized
         */
        public static Codec forCode(String code) {
            if (code == null) {
                return ISO_646;
            }

            // dispatch on a cheap hash of the last two characters, then
            // confirm with an exact comparison
            switch (last2digits(code)) {
            case 0:
                if (code.equals("ISO_IR 100") || code.equals("ISO 2022 IR 100"))
                    return Codec.ISO_8859_1;
                break;
            case 1:
                if (code.equals("ISO_IR 101") || code.equals("ISO 2022 IR 101"))
                    return Codec.ISO_8859_2;
                break;
            case 6:
                if (code.equals("ISO 2022 IR 6"))
                    return Codec.ISO_646;
                break;
            case 9:
                if (code.equals("ISO_IR 109") || code.equals("ISO 2022 IR 109"))
                    return Codec.ISO_8859_3;
                break;
            case 10:
                if (code.equals("ISO_IR 110") || code.equals("ISO 2022 IR 110"))
                    return Codec.ISO_8859_4;
                break;
            case 13:
                if (code.equals("ISO_IR 13") || code.equals("ISO 2022 IR 13"))
                    return Codec.JIS_X_201;
                break;
            case 26:
                if (code.equals("ISO_IR 126") || code.equals("ISO 2022 IR 126"))
                    return Codec.ISO_8859_7;
                break;
            case 27:
                if (code.equals("ISO_IR 127") || code.equals("ISO 2022 IR 127"))
                    return Codec.ISO_8859_6;
                break;
            case 30:
                if (code.equals("GB18030"))
                    return Codec.GB18030;
                break;
            case 31:
                // 'B' & 15 == 2 and 'K' & 15 == 11, hence 2 * 10 + 11
                if (code.equals("GBK"))
                    return Codec.GB18030;
                break;
            case 38:
                if (code.equals("ISO_IR 138") || code.equals("ISO 2022 IR 138"))
                    return Codec.ISO_8859_8;
                break;
            case 44:
                if (code.equals("ISO_IR 144") || code.equals("ISO 2022 IR 144"))
                    return Codec.ISO_8859_5;
                break;
            case 48:
                if (code.equals("ISO_IR 148") || code.equals("ISO 2022 IR 148"))
                    return Codec.ISO_8859_9;
                break;
            case 49:
                if (code.equals("ISO 2022 IR 149"))
                    return Codec.KS_X_1001;
                break;
            case 58:
                if (code.equals("ISO 2022 IR 58"))
                    return Codec.GB2312;
                break;
            case 59:
                if (code.equals("ISO 2022 IR 159"))
                    return Codec.JIS_X_212;
                break;
            case 66:
                if (code.equals("ISO_IR 166") || code.equals("ISO 2022 IR 166"))
                    return Codec.TIS_620;
                break;
            case 87:
                if (code.equals("ISO 2022 IR 87"))
                    return Codec.JIS_X_208;
                break;
            case 92:
                if (code.equals("ISO_IR 192"))
                    return Codec.UTF_8;
                break;
            }
            return ISO_646;
        }

        /**
         * Combines the low four bits of the last two characters of
         * {@code code} as if they were decimal digits.
         *
         * @return the combined value, or -1 if {@code code} has fewer than
         *         two characters
         */
        private static int last2digits(String code) {
            int len = code.length();
            if (len < 2) {
                return -1;
            }
            char ch1 = code.charAt(len - 1);
            char ch2 = code.charAt(len - 2);
            return (ch2 & 15) * 10 + (ch1 & 15);
        }

        /** Encodes {@code val} with this codec's charset. */
        public byte[] encode(String val) {
            try {
                return val.getBytes(charsetName);
            } catch (UnsupportedEncodingException e) {
                throw new AssertionError(e);
            }
        }

        /** Decodes {@code len} bytes of {@code b} starting at {@code off}. */
        public String decode(byte[] b, int off, int len) {
            try {
                return new String(b, off, len, charsetName);
            } catch (UnsupportedEncodingException e) {
                throw new AssertionError(e);
            }
        }

        public boolean containsASCII() {
            return containsASCII;
        }

        public int getEscSeq0() {
            return escSeq0;
        }

        public int getEscSeq1() {
            return escSeq1;
        }

        public int getBytesPerChar() {
            return bytesPerChar;
        }

        /** Returns {@code s} adjusted for display; identity unless overridden. */
        public String toText(String s) {
            return s;
        }
    }

    /** Couples a codec with a reusable {@link CharsetEncoder} for its charset. */
    private static final class Encoder {
        final Codec codec;
        final CharsetEncoder encoder;

        public Encoder(Codec codec) {
            this.codec = codec;
            this.encoder = Charset.forName(codec.charsetName).newEncoder();
        }

        /**
         * Returns an upper bound for the number of bytes this encoder
         * produces for {@code chars} characters.
         */
        int maxEncodedLength(int chars) {
            return (int) Math.ceil(chars * (double) encoder.maxBytesPerChar());
        }

        /**
         * Writes the escape sequence {@code escSeq} (nothing if 0) followed
         * by the encoded remaining content of {@code cb} to {@code bb}.
         *
         * @return {@code true} on success; {@code false} if encoding raised a
         *         {@link CharacterCodingException}, in which case the
         *         positions of both buffers are restored
         * @throws java.nio.BufferOverflowException if {@code bb} is too small
         */
        public boolean encode(CharBuffer cb, ByteBuffer bb, int escSeq,
                CodingErrorAction errorAction) {
            encoder.onMalformedInput(errorAction)
                    .onUnmappableCharacter(errorAction)
                    .reset();
            int cbmark = cb.position();
            int bbmark = bb.position();
            try {
                escSeq(bb, escSeq);
                CoderResult cr = encoder.encode(cb, bb, true);
                if (!cr.isUnderflow())
                    cr.throwException();
                cr = encoder.flush(bb);
                if (!cr.isUnderflow())
                    cr.throwException();
            } catch (CharacterCodingException x) {
                cb.position(cbmark);
                bb.position(bbmark);
                return false;
            }
            return true;
        }

        /**
         * Writes ESC followed by the low two bytes of {@code seq}, preceded
         * by its third byte if that is non-zero; writes nothing for 0.
         */
        private static void escSeq(ByteBuffer bb, int seq) {
            if (seq == 0) {
                return;
            }

            bb.put((byte) 0x1b);
            int b1 = seq >> 16;
            if (b1 != 0) {
                bb.put((byte) b1);
            }
            bb.put((byte) (seq >> 8));
            bb.put((byte) seq);
        }

        public byte[] replacement() {
            return encoder.replacement();
        }
    }

    /**
     * Character set with more than one codec: values may switch between the
     * codecs by means of escape sequences.
     */
    private static final class ISO2022 extends SpecificCharacterSet {

        private ISO2022(Codec[] charsetInfos, String... codes) {
            super(charsetInfos, codes);
        }

        /**
         * Encodes {@code val}. The whole value is first tried with the first
         * codec; only if that fails the value is split at {@code delimiters}
         * and each component is encoded separately, inserting escape
         * sequences where another codec is needed.
         */
        @Override
        public byte[] encode(String val, String delimiters) {
            int strlen = val.length();
            Encoder enc1 = encoder(cachedEncoder1, codecs[0]);
            // sized from the encoder so that a multi-byte first codec cannot
            // overflow the buffer
            ByteBuffer bb = ByteBuffer.wrap(new byte[enc1.maxEncodedLength(strlen)]);
            // try to encode whole string value with character set specified
            // by value1 of (0008,0005) Specific Character Set
            if (enc1.encode(CharBuffer.wrap(val.toCharArray()), bb, 0,
                    CodingErrorAction.REPORT)) {
                return Arrays.copyOf(bb.array(), bb.position());
            }

            // split whole string value according VR specific delimiters
            // and try to encode each component separately
            Encoder[] encs = new Encoder[codecs.length];
            encs[0] = enc1;
            encs[1] = encoder(cachedEncoder2, codecs[1]);
            int tokens = new StringTokenizer(val, delimiters, true).countTokens();
            int capacity = 2 * strlen + 4 * (tokens + 1);
            for (;;) {
                try {
                    return encodeComponents(val, delimiters, encs, capacity);
                } catch (java.nio.BufferOverflowException e) {
                    // The estimate is too small e.g. when a component needs
                    // an escape sequence in front of every character; start
                    // over with a larger buffer.
                    capacity *= 2;
                }
            }
        }

        /**
         * Encodes {@code val} component by component into a buffer of
         * {@code capacity} bytes.
         *
         * @throws java.nio.BufferOverflowException if {@code capacity} is too small
         */
        private byte[] encodeComponents(String val, String delimiters, Encoder[] encs,
                int capacity) {
            ByteBuffer bb = ByteBuffer.wrap(new byte[capacity]);
            // index into codecs of the codec currently active in G0 / G1
            int[] cur = { 0, 0 };
            StringTokenizer comps = new StringTokenizer(val, delimiters, true);
            while (comps.hasMoreTokens()) {
                String comp = comps.nextToken();
                if (comp.length() == 1 && delimiters.indexOf(comp.charAt(0)) >= 0) { // if delimiter
                    activateInitialCharacterSet(bb, cur);
                    bb.put((byte) comp.charAt(0));
                    continue;
                }
                encodeComponent(encs, CharBuffer.wrap(comp.toCharArray()), bb, cur);
            }
            activateInitialCharacterSet(bb, cur);
            return Arrays.copyOf(bb.array(), bb.position());
        }

        private void encodeComponent(Encoder[] encs, CharBuffer cb, ByteBuffer bb, int[] cur) {
            // try to encode component with current active character of G1
            if (codecs[cur[1]].getEscSeq1() != 0
                    && encs[cur[1]].encode(cb, bb, 0, CodingErrorAction.REPORT))
                return;

            // try to encode component with current active character set of G0, if different to G1
            if ((codecs[cur[1]].getEscSeq1() == 0
                    || codecs[cur[1]].getEscSeq0() != codecs[cur[0]].getEscSeq0())
                    && encs[cur[0]].encode(cb, bb, 0, CodingErrorAction.REPORT))
                return;

            // try every codec, last one first, preceded by its escape sequence
            int next = encs.length;
            while (--next >= 0) {
                if (encs[next] == null)
                    encs[next] = new Encoder(codecs[next]);
                if (codecs[next].getEscSeq1() != 0) {
                    if (encs[next].encode(cb, bb, codecs[next].getEscSeq1(),
                            CodingErrorAction.REPORT)) {
                        cur[1] = next;
                        break;
                    }
                } else {
                    if (encs[next].encode(cb, bb, codecs[next].getEscSeq0(),
                            CodingErrorAction.REPORT)) {
                        cur[0] = next;
                        break;
                    }
                }
            }
            if (next < 0) {
                if (cb.length() > 1) {
                    // no single codec covers the component: go char by char
                    for (int i = 0; i < cb.length(); i++) {
                        encodeComponent(encs, cb.subSequence(i, i + 1), bb, cur);
                    }
                } else {
                    // character could not be encoded with any of the
                    // specified character sets, encode it with the
                    // current character set of G0, using the default
                    // replacement of the character set decoder
                    // for characters which cannot be encoded
                    bb.put(encs[cur[0]].replacement());
                }
            }
        }

        /** Switches G0 and G1 back to the first codec if another one is active. */
        private void activateInitialCharacterSet(ByteBuffer bb, int[] cur) {
            if (cur[0] != 0) {
                Encoder.escSeq(bb, codecs[0].getEscSeq0());
                cur[0] = 0;
            }
            if (cur[1] != 0) {
                Encoder.escSeq(bb, codecs[0].getEscSeq1());
                cur[1] = 0;
            }
        }

        /**
         * Decodes {@code b}, following escape sequences that switch the
         * active codec. Unknown escape sequences are decoded as characters.
         * An escape sequence or multi-byte character that is cut off by the
         * end of {@code b} is decoded from the bytes that are present instead
         * of raising an index-out-of-bounds exception.
         */
        @Override
        public String decode(byte[] b) {
            Codec[] codec = { codecs[0], codecs[0] };
            int g = 0;
            int off = 0;
            int cur = 0;
            StringBuilder sb = new StringBuilder(b.length);
            while (cur < b.length) {
                if (b[cur] == 0x1b) { // ESC
                    if (off < cur) {
                        sb.append(codec[g].decode(b, off, cur - off));
                    }
                    int esc = cur;
                    if (b.length - esc < 3) {
                        // truncated escape sequence: decode what is left as chars
                        cur = appendAsChars(sb, codec[0], b, esc, 3);
                        off = cur;
                        continue;
                    }
                    cur += 3;
                    switch (((b[cur - 2] & 255) << 8) + (b[cur - 1] & 255)) {
                    case 0x2428:
                        if (cur < b.length && b[cur] == 0x44) {
                            cur++;
                            codec[0] = Codec.JIS_X_212;
                        } else { // decode invalid ESC sequence as chars
                            cur = appendAsChars(sb, codec[0], b, esc, 4);
                        }
                        break;
                    case 0x2429:
                        switch (cur < b.length ? b[cur] : -1) {
                        case 0x41:
                            cur++;
                            switchCodec(codec, 1, Codec.GB2312);
                            break;
                        case 0x43:
                            cur++;
                            switchCodec(codec, 1, Codec.KS_X_1001);
                            break;
                        default: // decode invalid ESC sequence as chars
                            cur = appendAsChars(sb, codec[0], b, esc, 4);
                        }
                        break;
                    case 0x2442:
                        codec[0] = Codec.JIS_X_208;
                        break;
                    case 0x2842:
                        switchCodec(codec, 0, Codec.ISO_646);
                        break;
                    case 0x284a:
                        codec[0] = Codec.JIS_X_201;
                        if (codec[1].getEscSeq1() == 0)
                            codec[1] = codec[0];
                        break;
                    case 0x2949:
                        codec[1] = Codec.JIS_X_201;
                        break;
                    case 0x2d41:
                        switchCodec(codec, 1, Codec.ISO_8859_1);
                        break;
                    case 0x2d42:
                        switchCodec(codec, 1, Codec.ISO_8859_2);
                        break;
                    case 0x2d43:
                        switchCodec(codec, 1, Codec.ISO_8859_3);
                        break;
                    case 0x2d44:
                        switchCodec(codec, 1, Codec.ISO_8859_4);
                        break;
                    case 0x2d46:
                        switchCodec(codec, 1, Codec.ISO_8859_7);
                        break;
                    case 0x2d47:
                        switchCodec(codec, 1, Codec.ISO_8859_6);
                        break;
                    case 0x2d48:
                        switchCodec(codec, 1, Codec.ISO_8859_8);
                        break;
                    case 0x2d4c:
                        switchCodec(codec, 1, Codec.ISO_8859_5);
                        break;
                    case 0x2d4d:
                        switchCodec(codec, 1, Codec.ISO_8859_9);
                        break;
                    case 0x2d54:
                        switchCodec(codec, 1, Codec.TIS_620);
                        break;
                    default: // decode invalid ESC sequence as chars
                        cur = appendAsChars(sb, codec[0], b, esc, 3);
                    }
                    off = cur;
                } else {
                    // a byte with the high bit set belongs to slot 1, any
                    // other byte to slot 0; flush when the slot changes
                    if (codec[0] != codec[1] && g == (b[cur] < 0 ? 0 : 1)) {
                        if (off < cur) {
                            sb.append(codec[g].decode(b, off, cur - off));
                        }
                        off = cur;
                        g = 1 - g;
                    }
                    int bytesPerChar = codec[g].getBytesPerChar();
                    cur += bytesPerChar > 0 ? bytesPerChar : b[cur] < 0 ? 2 : 1;
                }
            }
            // cur may point past the end if the last multi-byte char is cut off
            int end = Math.min(cur, b.length);
            if (off < end) {
                sb.append(codec[g].decode(b, off, end - off));
            }
            return sb.toString();
        }

        /**
         * Appends up to {@code maxLen} bytes of {@code b} starting at
         * {@code off}, limited by the end of {@code b}, decoded with
         * {@code codec}.
         *
         * @return index of the first byte after the appended range
         */
        private static int appendAsChars(StringBuilder sb, Codec codec, byte[] b,
                int off, int maxLen) {
            int len = Math.min(maxLen, b.length - off);
            sb.append(codec.decode(b, off, len));
            return off + len;
        }

        /**
         * Sets slot {@code i} to {@code codec}; if both slots then share the
         * same slot-0 designator, slot 0 is made to use the slot-1 codec.
         */
        private void switchCodec(Codec[] codecs, int i, Codec codec) {
            codecs[i] = codec;
            if (codecs[0].getEscSeq0() == codecs[1].getEscSeq0())
                codecs[0] = codecs[1];
        }

    }

    /** Returns the current value of {@link #DEFAULT}. */
    public static SpecificCharacterSet getDefaultCharacterSet() {
        return DEFAULT;
    }

    /**
     * Replaces {@link #DEFAULT} by the character set for {@code code}.
     *
     * @param code Specific Character Set code; {@code null} selects {@link #ASCII}
     * @throws IllegalArgumentException if the character set does not contain ASCII
     */
    public static void setDefaultCharacterSet(String code) {
        SpecificCharacterSet cs = code != null ? valueOf(code) : ASCII;
        if (!cs.containsASCII())
            throw new IllegalArgumentException("Default Character Set must contain ASCII - " + code);
        DEFAULT = cs;
    }

    /**
     * Returns the character set for the given Specific Character Set codes.
     *
     * @param codes values of (0008,0005); unrecognized or {@code null}
     *              elements are treated as US-ASCII
     * @return {@link #DEFAULT} if {@code codes} is {@code null} or empty,
     *         otherwise a new instance; with more than one code the instance
     *         supports code extension by escape sequences
     */
    public static SpecificCharacterSet valueOf(String... codes) {
        if (codes == null || codes.length == 0)
            return DEFAULT;

        Codec[] infos = new Codec[codes.length];
        for (int i = 0; i < codes.length; i++)
            infos[i] = Codec.forCode(codes[i]);
        return codes.length > 1 ? new ISO2022(infos, codes)
                : new SpecificCharacterSet(infos, codes);
    }

    /** Returns the codes this instance was created from (not a copy). */
    public String[] toCodes() {
        return dicomCodes;
    }

    /**
     * Returns the encoder cached in {@code tl} for the calling thread if it
     * belongs to {@code codec}; otherwise creates and caches a new one.
     */
    private static Encoder encoder(ThreadLocal<SoftReference<Encoder>> tl,
            Codec codec) {
        SoftReference<Encoder> sr;
        Encoder enc;
        if ((sr = tl.get()) == null || (enc = sr.get()) == null
                || enc.codec != codec)
            tl.set(new SoftReference<Encoder>(enc = new Encoder(codec)));
        return enc;
    }

    protected SpecificCharacterSet(Codec[] codecs, String... codes) {
        this.codecs = codecs;
        this.dicomCodes = codes;
    }

    /**
     * Encodes {@code val} with the first codec.
     *
     * @param val        string to encode
     * @param delimiters component delimiters; not used by this implementation
     */
    public byte[] encode(String val, String delimiters) {
        return codecs[0].encode(val);
    }

    /** Decodes all bytes of {@code val} with the first codec. */
    public String decode(byte[] val) {
        return codecs[0].decode(val, 0, val.length);
    }

    public boolean isUTF8() {
        return codecs[0].equals(Codec.UTF_8);
    }

    public boolean isASCII() {
        return codecs[0].equals(Codec.ISO_646);
    }

    public boolean containsASCII() {
        return codecs[0].containsASCII();
    }

    /** Applies the first codec's display adjustment to {@code s}. */
    public String toText(String s) {
        return codecs[0].toText(s);
    }

    /**
     * Two character sets are equal if they are of the same class and use the
     * same codecs in the same order.
     */
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (other == null) {
            return false;
        }
        if (getClass() != other.getClass()) {
            return false;
        }
        final SpecificCharacterSet othercs = (SpecificCharacterSet) other;
        return Arrays.equals(this.codecs, othercs.codecs);
    }

    @Override
    public int hashCode() {
        return Arrays.hashCode(this.codecs);
    }

}