diff --git a/src/defs.h b/src/defs.h index 50b22e3..9a08c3f 100644 --- a/src/defs.h +++ b/src/defs.h @@ -171,6 +171,7 @@ int qp; int htmltag; int style; + int meta_content_type; int skip_html; int has_to_dump; int has_to_dump_whole_body; diff --git a/src/parser.c b/src/parser.c index a1beb34..1a9aebe 100644 --- a/src/parser.c +++ b/src/parser.c @@ -641,7 +641,7 @@ state->pushed_pointer = 0; memset(state->type, 0, TINYBUFSIZE); - snprintf(state->charset, TINYBUFSIZE-1, "unknown"); + memset(state->charset, 0, TINYBUFSIZE); memset(state->attachment_name_buf, 0, SMALLBUFSIZE); state->anamepos = 0; @@ -684,7 +684,18 @@ if(state->texthtml == 1 && state->message_state == MSG_BODY) markHTML(buf, state); - if(state->texthtml == 1) decodeHTML(buf, state->utf8); + if(state->texthtml == 1){ + size_t buflen = strlen(buf); + decodeHTML(buf, state->utf8); + /* decodeHTML converted some entities to iso-8859-1 */ + if(state->utf8 != 1 && strlen(buf) != buflen){ + /* no charset or us-ascii: switch to iso-8859-1 */ + if (state->charset[0] == 0 || strcasecmp(state->charset, "us-ascii") == 0){ + syslog(LOG_PRIORITY, "%s: assuming iso-8859-1 encoding for HTML (was '%s')", sdata->ttmpfile, state->charset); + snprintf(state->charset, TINYBUFSIZE-1, "ISO8859-1"); + } + } + } /* encode the body if it's not utf-8 encoded */ if(state->message_state == MSG_BODY && state->utf8 != 1){ diff --git a/src/parser.h b/src/parser.h index c943849..ae649e5 100644 --- a/src/parser.h +++ b/src/parser.h @@ -20,7 +20,7 @@ void fixupSoftBreakInQuotedPritableLine(char *buf, struct parser_state *state); void fixupBase64EncodedLine(char *buf, struct parser_state *state); void markHTML(char *buf, struct parser_state *state); -void setStateHTMLStyle(char *htmlbuf, int pos, struct parser_state *state); +void setStateHTML(char *htmlbuf, int pos, struct parser_state *state); void translateLine(unsigned char *p, struct parser_state *state); void fix_email_address_for_sphinx(char *s); void split_email_address(char *s); 
diff --git a/src/parser_utils.c b/src/parser_utils.c
index 5ffc96d..869828f 100644
--- a/src/parser_utils.c
+++ b/src/parser_utils.c
@@ -40,6 +40,7 @@
    state->htmltag = 0;
    state->style = 0;
+   state->meta_content_type = 0;
    state->skip_html = 0;
@@ -52,6 +53,7 @@
    memset(state->receivedbuf, 0, sizeof(state->receivedbuf));
    memset(state->type, 0, TINYBUFSIZE);
+   memset(state->charset, 0, TINYBUFSIZE);
    memset(state->attachment_name_buf, 0, SMALLBUFSIZE);
    state->anamepos = 0;
@@ -551,7 +553,7 @@
       if(isspace(*s)){
          if(j > 0){
-            setStateHTMLStyle(html, pos, state);
+            setStateHTML(html, pos, state);
             memset(html, 0, SMALLBUFSIZE); j=0;
          }
          pos++;
@@ -576,23 +578,69 @@
          if(j > 0){
             strncat(html, " ", SMALLBUFSIZE-1);
-            setStateHTMLStyle(html, pos, state);
+            setStateHTML(html, pos, state);
             memset(html, 0, SMALLBUFSIZE); j=0;
          }
+         state->meta_content_type = 0;
       }
    }
    //printf("append last in line:*%s*, html=+%s+, j=%d\n", puf, html, j);
-   if(j > 0){ setStateHTMLStyle(html, pos, state); }
+   if(j > 0){ setStateHTML(html, pos, state); }
    strcpy(buf, puf);
 }
+/*
+ * Update parser state from one token of an HTML tag.
+ *
+ * htmlbuf  NUL-terminated token text built by the caller
+ * pos      token position within the tag; the tag name is tested at pos 0
+ * state    parser state; style, meta_content_type and charset may change
+ *
+ * "style " / "/style " at pos 0 set / clear state->style.
+ * While state->charset is empty, "meta " at pos 0 starts a bit mask in
+ * state->meta_content_type: 0x1 meta tag, 0x2 http-equiv=content-type seen,
+ * 0x4 content=text/html; seen. Once all three bits are set, the token after
+ * "charset=" is copied into state->charset and the mask is cleared.
+ */
-void setStateHTMLStyle(char *htmlbuf, int pos, struct parser_state *state){
+void setStateHTML(char *htmlbuf, int pos, struct parser_state *state){
    if(pos == 0 && strncmp(htmlbuf, "style ", 6) == 0) state->style = 1;
    if(pos == 0 && strncmp(htmlbuf, "/style ", 7) == 0) state->style = 0;
+
+   if(pos == 0 && state->charset[0] == 0 && strncmp(htmlbuf, "meta ", 5) == 0) state->meta_content_type = 0x1;
+   if(state->meta_content_type){
+      if((state->meta_content_type & 0x2) == 0 && strstr(htmlbuf, "http-equiv=content-type "))
+         state->meta_content_type |= 0x2;
+
+      if((state->meta_content_type & 0x4) == 0 && strstr(htmlbuf, "content=text/html;"))
+         state->meta_content_type |= 0x4;
+
+      if(state->meta_content_type == 0x7){
+         char *p, *q;
+
+         p = strstr(htmlbuf, "charset=");
+         if(p){
+            size_t len;
+
+            p += 8;
+            /* test for NUL first: strchr() also matches the string terminator */
+            for(q = p; *q && (isalnum((unsigned char)*q) || strchr("-_", *q)); q++)
+               ;
+
+            len = (size_t)(q - p);
+            if(len > 0 && len < sizeof(state->charset)){
+               /* "%.*s": the precision caps the output at len bytes of p */
+               syslog(LOG_PRIORITY, "Changing HTML charset from '%s' to '%.*s' due to meta tag", state->charset, (int)len, p);
+               memcpy(state->charset, p, len);
+               state->charset[len] = '\0';
+               state->meta_content_type = 0;
+            }
+         }
+      }
+   }
 }