diff options
author | hiro <hiro@ee746299-78ed-0310-b773-934348b2243d> | 2005-09-26 10:23:49 +0000 |
---|---|---|
committer | hiro <hiro@ee746299-78ed-0310-b773-934348b2243d> | 2005-09-26 10:23:49 +0000 |
commit | 425a81892d75f869e199df045d80e237852d390d (patch) | |
tree | f4e5da8f6cd7963bcad8879d9c4385898322b472 | |
parent | f5bfca4385a9ce419b9e9386b9f27a41e9b86f9f (diff) |
libsylph/html.c: improved entity references conversion.
git-svn-id: svn://sylpheed.sraoss.jp/sylpheed/trunk@603 ee746299-78ed-0310-b773-934348b2243d
-rw-r--r-- | ChangeLog | 6 | ||||
-rw-r--r-- | ChangeLog.ja | 6 | ||||
-rw-r--r-- | libsylph/html.c | 211 |
3 files changed, 150 insertions, 73 deletions
@@ -1,3 +1,9 @@ +2005-09-26 + + * libsylph/html.c: convert character entity references to UTF-8, + rather than US-ASCII. Use UCS-4 to UTF-8 conversion for numbered + references. + 2005-09-22 * libsylph/utils.[ch] diff --git a/ChangeLog.ja b/ChangeLog.ja index e34ca4da..486c9661 100644 --- a/ChangeLog.ja +++ b/ChangeLog.ja @@ -1,3 +1,9 @@ +2005-09-26 + + * libsylph/html.c: 文字実体参照を US-ASCII でなく UTF-8 に変換する + ようにした。数値による参照で UCS-4 から UTF-8 への変換を使用する + ようにした。 + 2005-09-22 * libsylph/utils.[ch] diff --git a/libsylph/html.c b/libsylph/html.c index afaacf33..94da46df 100644 --- a/libsylph/html.c +++ b/libsylph/html.c @@ -41,74 +41,131 @@ static HTMLSymbol symbol_list[] = { {"&lt;" , "<"}, {"&gt;" , ">"}, {"&amp;" , "&"}, - {"&quot;" , "\""}, - {"&nbsp;" , " "}, - {"&trade;" , "(TM)"}, + {"&quot;" , "\""} +}; - {"&#153;", "(TM)"}, +/* &#160; - &#255; */ +static HTMLSymbol latin_symbol_list[] = { + {"&nbsp;" , " "}, + /* {"&nbsp;" , "\302\240"}, */ + {"&iexcl;" , "\302\241"}, + {"&cent;" , "\302\242"}, + {"&pound;" , "\302\243"}, + {"&curren;", "\302\244"}, + {"&yen;" , "\302\245"}, + {"&brvbar;", "\302\246"}, + {"&sect;" , "\302\247"}, + {"&uml;" , "\302\250"}, + {"&copy;" , "\302\251"}, + {"&ordf;" , "\302\252"}, + {"&laquo;" , "\302\253"}, + {"&not;" , "\302\254"}, + {"&shy;" , "\302\255"}, + {"&reg;" , "\302\256"}, + {"&macr;" , "\302\257"}, + {"&deg;" , "\302\260"}, + {"&plusm;" , "\302\261"}, + {"&sup2;" , "\302\262"}, + {"&sup3;" , "\302\263"}, + {"&acute;" , "\302\264"}, + {"&micro;" , "\302\265"}, + {"&para;" , "\302\266"}, + {"&middot;", "\302\267"}, + {"&cedil;" , "\302\270"}, + {"&sup1;" , "\302\271"}, + {"&ordm;" , "\302\272"}, + {"&raquo;" , "\302\273"}, + {"&frac14;", "\302\274"}, + {"&frac12;", "\302\275"}, + {"&frac34;", "\302\276"}, + {"&iquest;", "\302\277"}, + + {"&Agrave;", "\303\200"}, + {"&Aacute;", "\303\201"}, + {"&Acirc;" , "\303\202"}, + {"&Atilde;", "\303\203"}, + {"&Auml;" , "\303\204"}, + {"&Aring;" , "\303\205"}, + {"&AElig;" , "\303\206"}, + {"&Ccedil;", "\303\207"}, + {"&Egrave;", "\303\210"}, + {"&Eacute;", "\303\211"}, + {"&Ecirc;" , "\303\212"}, + {"&Euml;" , "\303\213"}, + {"&Igrave;", "\303\214"}, + {"&Iacute;", "\303\215"}, + {"&Icirc;" , "\303\216"}, + {"&Iuml;" , "\303\217"}, + {"&ETH;" , "\303\220"}, + {"&Ntilde;", "\303\221"}, + {"&Ograve;", "\303\222"}, + {"&Oacute;", 
"\303\223"}, + {"&Ocirc;" , "\303\224"}, + {"&Otilde;", "\303\225"}, + {"&Ouml;" , "\303\226"}, + {"&times;" , "\303\227"}, + {"&Oslash;", "\303\230"}, + {"&Ugrave;", "\303\231"}, + {"&Uacute;", "\303\232"}, + {"&Ucirc;" , "\303\233"}, + {"&Uuml;" , "\303\234"}, + {"&Yacute;", "\303\235"}, + {"&THORN;" , "\303\236"}, + {"&szlig;" , "\303\237"}, + {"&agrave;", "\303\240"}, + {"&aacute;", "\303\241"}, + {"&acirc;" , "\303\242"}, + {"&atilde;", "\303\243"}, + {"&auml;" , "\303\244"}, + {"&aring;" , "\303\245"}, + {"&aelig;" , "\303\246"}, + {"&ccedil;", "\303\247"}, + {"&egrave;", "\303\250"}, + {"&eacute;", "\303\251"}, + {"&ecirc;" , "\303\252"}, + {"&euml;" , "\303\253"}, + {"&igrave;", "\303\254"}, + {"&iacute;", "\303\255"}, + {"&icirc;" , "\303\256"}, + {"&iuml;" , "\303\257"}, + {"&eth;" , "\303\260"}, + {"&ntilde;", "\303\261"}, + {"&ograve;", "\303\262"}, + {"&oacute;", "\303\263"}, + {"&ocirc;" , "\303\264"}, + {"&otilde;", "\303\265"}, + {"&ouml;" , "\303\266"}, + {"&divide;", "\303\267"}, + {"&oslash;", "\303\270"}, + {"&ugrave;", "\303\271"}, + {"&uacute;", "\303\272"}, + {"&ucirc;" , "\303\273"}, + {"&uuml;" , "\303\274"}, + {"&yacute;", "\303\275"}, + {"&thorn;" , "\303\276"}, + {"&yuml;" , "\303\277"} }; -static HTMLSymbol ascii_symbol_list[] = { - {"&iexcl;" , "^!"}, - {"&brvbar;", "|"}, - {"&copy;" , "(C)"}, - {"&laquo;" , "<<"}, - {"&reg;" , "(R)"}, - - {"&sup2;" , "^2"}, - {"&sup3;" , "^3"}, - {"&acute;" , "'"}, - {"&cedil;" , ","}, - {"&sup1;" , "^1"}, - {"&raquo;" , ">>"}, - {"&frac14;", "1/4"}, - {"&frac12;", "1/2"}, - {"&frac34;", "3/4"}, - {"&iquest;", "^?"}, - - {"&Agrave;", "A`"}, - {"&Aacute;", "A'"}, - {"&Acirc;" , "A^"}, - {"&Atilde;", "A~"}, - {"&AElig;" , "AE"}, - {"&Egrave;", "E`"}, - {"&Eacute;", "E'"}, - {"&Ecirc;" , "E^"}, - {"&Igrave;", "I`"}, - {"&Iacute;", "I'"}, - {"&Icirc;" , "I^"}, - - {"&Ntilde;", "N~"}, - {"&Ograve;", "O`"}, - {"&Oacute;", "O'"}, - {"&Ocirc;" , "O^"}, - {"&Otilde;", "O~"}, - {"&Ugrave;", "U`"}, - {"&Uacute;", "U'"}, - {"&Ucirc;" , "U^"}, - {"&Yacute;", "Y'"}, - - {"&agrave;", "a`"}, - {"&aacute;", "a'"}, - {"&acirc;" , "a^"}, - {"&atilde;", "a~"}, - {"&aelig;" , "ae"}, - {"&egrave;", "e`"}, - {"&eacute;", "e'"}, - {"&ecirc;" , "e^"}, - {"&igrave;", "i`"}, - {"&iacute;", "i'"}, - {"&icirc;" , "i^"}, - - {"&ntilde;", "n~"}, - {"&ograve;", "o`"}, - {"&oacute;", "o'"}, - {"&ocirc;" , "o^"}, - {"&otilde;", "o~"}, - {"&ugrave;", "u`"}, - {"&uacute;", "u'"}, - {"&ucirc;" , "u^"}, - {"&yacute;", "y'"}, +static HTMLSymbol other_symbol_list[] = { + /* Non-standard? 
*/ + {"&#133;" , "..."}, + {"&#146;" , "'"}, + {"&#150;" , "-"}, + {"&#153;" , "\xe2\x84\xa2"}, + {"&#156;" , "\xc5\x93"}, + + /* Symbolic characters */ + {"&trade;" , "\xe2\x84\xa2"}, + + /* Latin extended */ + {"&OElig;" , "\xc5\x92"}, + {"&oelig;" , "\xc5\x93"}, + {"&Scaron;", "\xc5\xa0"}, + {"&scaron;", "\xc5\xa1"}, + {"&Yuml;" , "\xc5\xb8"}, + {"&circ;" , "\xcb\x86"}, + {"&tilde;" , "\xcb\x9c"}, + {"&fnof;" , "\xc6\x92"}, }; static GHashTable *default_symbol_table; @@ -158,7 +215,8 @@ HTMLParser *html_parser_new(FILE *fp, CodeConverter *conv) default_symbol_table = g_hash_table_new(g_str_hash, g_str_equal); SYMBOL_TABLE_ADD(default_symbol_table, symbol_list); - SYMBOL_TABLE_ADD(default_symbol_table, ascii_symbol_list); + SYMBOL_TABLE_ADD(default_symbol_table, latin_symbol_list); + SYMBOL_TABLE_ADD(default_symbol_table, other_symbol_list); } #undef SYMBOL_TABLE_ADD @@ -288,11 +346,8 @@ static void html_append_str(HTMLParser *parser, const gchar *str, gint len) if (len == 0) return; if (len < 0) g_string_append(string, str); - else { - gchar *s; - Xstrndup_a(s, str, len, return); - g_string_append(string, s); - } + else + g_string_append_len(string, str, len); parser->empty_line = FALSE; if (string->len > 0 && string->str[string->len - 1] == '\n') { @@ -514,12 +569,22 @@ static void html_parse_special(HTMLParser *parser) } else if (symbol_name[1] == '#' && g_ascii_isdigit(symbol_name[2])) { gint ch; - /* TODO: support other entity references */ ch = atoi(symbol_name + 2); - if (g_ascii_isprint(ch)) { + if (ch < 128 && g_ascii_isprint(ch)) { html_append_char(parser, ch); parser->state = HTML_NORMAL; return; + } else { + /* ISO 10646 to UTF-8 */ + gchar buf[6]; + gint len; + + len = g_unichar_to_utf8((gunichar)ch, buf); + if (len > 0) { + html_append_str(parser, buf, len); + parser->state = HTML_NORMAL; + return; + } } } |