diff --git a/WWW/Library/Implementation/SGML.c b/WWW/Library/Implementation/SGML.c index 1f20ee8..5fbea95 100644 --- a/WWW/Library/Implementation/SGML.c +++ b/WWW/Library/Implementation/SGML.c @@ -93,6 +93,11 @@ static void fake_put_character(HTStream *p GCC_UNUSED, /*the following macros are used for pretty source view. */ #define IS_C(attr) (attr.type == HTMLA_CLASS) +#if defined(ISO2022JP_TOUTF8) && defined(EXP_JAPANESEUTF8_SUPPORT) +# define UTF8_TTY_ISO2022JP (me->T.output_utf8) +#else +# define UTF8_TTY_ISO2022JP 0 +#endif HTCJKlang HTCJK = NOCJK; /* CJK enum value. */ BOOL HTPassEightBitRaw = FALSE; /* Pass 161-172,174-255 raw. */ BOOL HTPassEightBitNum = FALSE; /* Pass ^ numeric entities raw. */ @@ -1659,6 +1664,34 @@ static void SGML_character(HTStream *me, int c_in) /* * If we want the raw input converted to Unicode, try that now. - FM */ + /* Convert ISO-2022-JP to Unicode (charset=iso-2022-jp is unrecognized) */ +#define IS_JIS7_HILO(c) (0x20<(c)&&(c)<0x7F) + if (UTF8_TTY_ISO2022JP && (me->state == S_nonascii_text + || me->state == S_nonascii_text_sq + || me->state == S_nonascii_text_dq)) { + /* end of ISO-2022-JP? || not in ISO-2022-JP range */ + if (TOASCII(c) == '\033' || !IS_JIS7_HILO(c)) { + me->kanji_buf = '\0'; + goto top1; + } + if (me->kanji_buf == '\t') { /* flag for single byte kana in "ESC(I" */ + if (conv_jisx0201kana) { + JISx0201TO0208_SJIS(c | 0200, me->U.utf_buf, me->U.utf_buf + 1); + clong = UCTransJPToUni(me->U.utf_buf, 2, UCGetLYhndl_byMIME("shift_jis")); + } else { + clong = UCTransToUni(c | 0200, UCGetLYhndl_byMIME("shift_jis")); + } + } else if (me->kanji_buf) { + me->U.utf_buf[0] = me->kanji_buf | 0200; /* to EUC-JP */ + me->U.utf_buf[1] = c | 0200; + clong = UCTransJPToUni(me->U.utf_buf, 2, UCGetLYhndl_byMIME("euc-jp")); + me->kanji_buf = '\0'; + } else { + me->kanji_buf = c; + clong = ucNeedMore; + } + goto top1; + } if (me->T.trans_to_uni && #ifdef EXP_JAPANESEUTF8_SUPPORT ((strcmp(LYCharSet_UC[me->inUCLYhndl].MIMEname, "euc-jp") == 0) || @@ -1808,7 +1841,8 @@ static void SGML_character(HTStream *me, int c_in) */ if (TOASCII(clong) < 32 && c != '\t' && c != '\n' && c != '\r' && - !IS_CJK_TTY) + !IS_CJK_TTY && + !(UTF8_TTY_ISO2022JP && TOASCII(c) == '\033')) goto after_switch; /* @@ -1916,13 +1950,14 @@ static void SGML_character(HTStream *me, int c_in) me->state = S_in_kanji; me->kanji_buf = c; break; - } else if (IS_CJK_TTY && TOASCII(c) == '\033') { /* S/390 -- gil -- 0881 */ + } else if ((IS_CJK_TTY || UTF8_TTY_ISO2022JP) && TOASCII(c) == '\033') { /* S/390 -- gil -- 0881 */ /* * Setting up for CJK escape sequence handling (based on Takuya * ASADA's (asada@three-a.co.jp) CJK Lynx). - FM */ me->state = S_esc; - PUTC(c); + if (!UTF8_TTY_ISO2022JP) + PUTC(c); break; } @@ -3649,7 +3684,8 @@ static void SGML_character(HTStream *me, int c_in) * - Takuya ASADA (asada@three-a.co.jp) */ me->state = S_esc_sq; - HTChunkPutc(string, c); + if (!UTF8_TTY_ISO2022JP) + HTChunkPutc(string, c); } else if (me->T.decode_utf8 && *me->U.utf_buf) { HTChunkPuts(string, me->U.utf_buf); @@ -3693,7 +3729,8 @@ static void SGML_character(HTStream *me, int c_in) * - Takuya ASADA (asada@three-a.co.jp) */ me->state = S_esc_dq; - HTChunkPutc(string, c); + if (!UTF8_TTY_ISO2022JP) + HTChunkPutc(string, c); } else if (me->T.decode_utf8 && *me->U.utf_buf) { HTChunkPuts(string, me->U.utf_buf); @@ -3956,8 +3993,11 @@ static void SGML_character(HTStream *me, int c_in) me->state = S_paren; } else { me->state = S_text; + if (UTF8_TTY_ISO2022JP) + goto top1; } - PUTC(c); + if (!UTF8_TTY_ISO2022JP) + PUTC(c); break; case S_dollar: /* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */ @@ -3966,7 +4006,8 @@ static void SGML_character(HTStream *me, int c_in) } else if (c == '(') { me->state = S_dollar_paren; } - PUTC(c); + if (!UTF8_TTY_ISO2022JP) + PUTC(c); break; case S_dollar_paren: /* Expecting 'C' after CJK "ESC$(". */ @@ -3974,8 +4015,13 @@ static void SGML_character(HTStream *me, int c_in) me->state = S_nonascii_text; } else { me->state = S_text; + if (UTF8_TTY_ISO2022JP) { + PUTS("$("); + goto top1; + } } - PUTC(c); + if (!UTF8_TTY_ISO2022JP) + PUTC(c); break; case S_paren: /* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */ @@ -3983,19 +4029,30 @@ static void SGML_character(HTStream *me, int c_in) me->state = S_text; } else if (c == 'I') { me->state = S_nonascii_text; + if (UTF8_TTY_ISO2022JP) + me->kanji_buf = '\t'; /* flag for single byte katakana */ } else { me->state = S_text; + if (UTF8_TTY_ISO2022JP) { + PUTC('('); + goto top1; + } } - PUTC(c); + if (!UTF8_TTY_ISO2022JP) + PUTC(c); break; case S_nonascii_text: /* Expecting CJK ESC after non-ASCII text. */ if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1264 */ me->state = S_esc; - } - PUTC(c); - if (c < 32) + } else if (c < 32) { me->state = S_text; + } + if (UTF8_TTY_ISO2022JP) { + if (TOASCII(c) != '\033') + PUTUTF8(clong); + } else + PUTC(c); break; case S_esc_sq: /* Expecting '$'or '(' following CJK ESC. */ @@ -4005,8 +4062,11 @@ static void SGML_character(HTStream *me, int c_in) me->state = S_paren_sq; } else { me->state = S_squoted; + if (UTF8_TTY_ISO2022JP) + goto top1; } - HTChunkPutc(string, c); + if (!UTF8_TTY_ISO2022JP) + HTChunkPutc(string, c); break; case S_dollar_sq: /* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */ @@ -4015,7 +4075,8 @@ static void SGML_character(HTStream *me, int c_in) } else if (c == '(') { me->state = S_dollar_paren_sq; } - HTChunkPutc(string, c); + if (!UTF8_TTY_ISO2022JP) + HTChunkPutc(string, c); break; case S_dollar_paren_sq: /* Expecting 'C' after CJK "ESC$(". */ @@ -4023,8 +4084,13 @@ static void SGML_character(HTStream *me, int c_in) me->state = S_nonascii_text_sq; } else { me->state = S_squoted; + if (UTF8_TTY_ISO2022JP) { + HTChunkPuts(string, "$("); + goto top1; + } } - HTChunkPutc(string, c); + if (!UTF8_TTY_ISO2022JP) + HTChunkPutc(string, c); break; case S_paren_sq: /* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */ @@ -4032,17 +4098,28 @@ static void SGML_character(HTStream *me, int c_in) me->state = S_squoted; } else if (c == 'I') { me->state = S_nonascii_text_sq; + if (UTF8_TTY_ISO2022JP) + me->kanji_buf = '\t'; /* flag for single byte katakana */ } else { me->state = S_squoted; + if (UTF8_TTY_ISO2022JP) { + HTChunkPutc(string, '('); + goto top1; + } } - HTChunkPutc(string, c); + if (!UTF8_TTY_ISO2022JP) + HTChunkPutc(string, c); break; case S_nonascii_text_sq: /* Expecting CJK ESC after non-ASCII text. */ if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1281 */ me->state = S_esc_sq; } - HTChunkPutc(string, c); + if (UTF8_TTY_ISO2022JP) { + if (TOASCII(c) != '\033') + HTChunkPutUtf8Char(string, clong); + } else + HTChunkPutc(string, c); break; case S_esc_dq: /* Expecting '$'or '(' following CJK ESC. */ @@ -4052,8 +4129,11 @@ static void SGML_character(HTStream *me, int c_in) me->state = S_paren_dq; } else { me->state = S_dquoted; + if (UTF8_TTY_ISO2022JP) + goto top1; } - HTChunkPutc(string, c); + if (!UTF8_TTY_ISO2022JP) + HTChunkPutc(string, c); break; case S_dollar_dq: /* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */ @@ -4062,7 +4142,8 @@ static void SGML_character(HTStream *me, int c_in) } else if (c == '(') { me->state = S_dollar_paren_dq; } - HTChunkPutc(string, c); + if (!UTF8_TTY_ISO2022JP) + HTChunkPutc(string, c); break; case S_dollar_paren_dq: /* Expecting 'C' after CJK "ESC$(". */ @@ -4070,8 +4151,13 @@ static void SGML_character(HTStream *me, int c_in) me->state = S_nonascii_text_dq; } else { me->state = S_dquoted; + if (UTF8_TTY_ISO2022JP) { + HTChunkPuts(string, "$("); + goto top1; + } } - HTChunkPutc(string, c); + if (!UTF8_TTY_ISO2022JP) + HTChunkPutc(string, c); break; case S_paren_dq: /* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */ @@ -4079,17 +4165,28 @@ static void SGML_character(HTStream *me, int c_in) me->state = S_dquoted; } else if (c == 'I') { me->state = S_nonascii_text_dq; + if (UTF8_TTY_ISO2022JP) + me->kanji_buf = '\t'; /* flag for single byte katakana */ } else { me->state = S_dquoted; + if (UTF8_TTY_ISO2022JP) { + HTChunkPutc(string, '('); + goto top1; + } } - HTChunkPutc(string, c); + if (!UTF8_TTY_ISO2022JP) + HTChunkPutc(string, c); break; case S_nonascii_text_dq: /* Expecting CJK ESC after non-ASCII text. */ if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1298 */ me->state = S_esc_dq; } - HTChunkPutc(string, c); + if (UTF8_TTY_ISO2022JP) { + if (TOASCII(c) != '\033') + HTChunkPutUtf8Char(string, clong); + } else + HTChunkPutc(string, c); break; case S_junk_tag: