/* * Copyright (c) 2002, Adam Dunkels. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * This file is part of the Contiki desktop environment * * */ /* htmlparser.c: * * Implements a very simplistic HTML parser. It recognizes HTML links * (-tags), HTML img alt tags, a few text flow break tags G * (
<br>, <p>, <h>), the <li> tag (but does not even try to
 * distinguish between <ol> or <ul>
        ) as well as HTML comment tags * (). * * To save memory, the HTML parser is state machine driver, which * means that it will shave off one character from the HTML page, * process that character, and return to the next. Another way of * doing it would be to buffer a number of characters and process them * together. * * The main function in this file is the htmlparser_parse() function * which takes a htmlparser_state structur and a part of an HTML file * as an argument. The htmlparser_parse() function will call the * helper functions parse_char() and parse_tag(). Those functions will * in turn call the two callback functions htmlparser_char() and * htmlparser_tag(). Those functions must be implemented by the using * module (e.g., a web browser program). * * htmlparser_char() will be called for every non-tag character. * * htmlparser_tag() will be called whenever a full tag has been found. * */ #include #include "contiki.h" #include "html-strings.h" #include "www.h" #include "htmlparser.h" #if 1 #define PRINTF(x) #else #include #define PRINTF(x) printf x #endif /*-----------------------------------------------------------------------------------*/ #define ISO_A 0x41 #define ISO_B 0x42 #define ISO_E 0x45 #define ISO_F 0x46 #define ISO_G 0x47 #define ISO_H 0x48 #define ISO_I 0x49 #define ISO_L 0x4c #define ISO_M 0x4d #define ISO_P 0x50 #define ISO_R 0x52 #define ISO_T 0x54 #define ISO_a (ISO_A | 0x20) #define ISO_b (ISO_B | 0x20) #define ISO_e (ISO_E | 0x20) #define ISO_f (ISO_F | 0x20) #define ISO_g (ISO_G | 0x20) #define ISO_h (ISO_H | 0x20) #define ISO_i (ISO_I | 0x20) #define ISO_l (ISO_L | 0x20) #define ISO_m (ISO_M | 0x20) #define ISO_p (ISO_P | 0x20) #define ISO_r (ISO_R | 0x20) #define ISO_t (ISO_T | 0x20) #define ISO_ht 0x09 #define ISO_nl 0x0a #define ISO_cr 0x0d #define ISO_space 0x20 #define ISO_bang 0x21 #define ISO_citation 0x22 #define ISO_ampersand 0x26 #define ISO_citation2 0x27 #define ISO_asterisk 0x2a #define ISO_dash 0x2d #define ISO_slash 
0x2f #define ISO_semicolon 0x3b #define ISO_lt 0x3c #define ISO_eq 0x3d #define ISO_gt 0x3e #define MINORSTATE_NONE 0 #define MINORSTATE_TEXT 1 /* Parse normal text */ #define MINORSTATE_EXTCHAR 2 /* Check for semi-colon */ #define MINORSTATE_TAG 3 /* Check for name of tag. */ #define MINORSTATE_TAGEND 4 /* Scan for end of tag. */ #define MINORSTATE_TAGATTR 5 /* Parse tag attr. */ #define MINORSTATE_TAGATTRSPACE 6 /* Parse optional space after tag attr. */ #define MINORSTATE_TAGATTRPARAM 7 /* Parse tag attr parameter. */ #define MINORSTATE_TAGATTRPARAMNQ 8 /* Parse tag attr parameter without quotation marks. */ #define MINORSTATE_HTMLCOMMENT 9 /* Scan for HTML comment end */ #define MAJORSTATE_NONE 0 #define MAJORSTATE_BODY 1 #define MAJORSTATE_LINK 2 #define MAJORSTATE_FORM 3 #define MAJORSTATE_DISCARD 4 #define MAJORSTATE_SCRIPT 5 struct htmlparser_state { unsigned char minorstate; char tag[20]; unsigned char tagptr; char tagattr[20]; unsigned char tagattrptr; char tagattrparam[WWW_CONF_MAX_URLLEN + 1]; unsigned char tagattrparamptr; unsigned char quotechar; unsigned char majorstate, lastmajorstate; char linkurl[WWW_CONF_MAX_URLLEN + 1]; char word[WWW_CONF_WEBPAGE_WIDTH]; unsigned char wordlen; #if WWW_CONF_FORMS char formaction[WWW_CONF_MAX_FORMACTIONLEN + 1]; unsigned char inputtype; char inputname[WWW_CONF_MAX_INPUTNAMELEN + 1]; char inputvalue[WWW_CONF_MAX_INPUTVALUELEN + 1]; unsigned char inputvaluesize; #endif /* WWW_CONF_FORMS */ }; static struct htmlparser_state s; /*-----------------------------------------------------------------------------------*/ static char last[1] = {(char)0xff}; static const char *tags[] = { #define TAG_FIRST 0 #define TAG_SLASHA 0 html_slasha, #define TAG_SLASHDIV 1 html_slashdiv, #define TAG_SLASHFORM 2 html_slashform, #define TAG_SLASHH 3 html_slashh, #define TAG_SLASHSCRIPT 4 html_slashscript, #define TAG_SLASHSELECT 5 html_slashselect, #define TAG_SLASHSTYLE 6 html_slashstyle, #define TAG_A 7 html_a, #define TAG_BODY 8 
html_body, #define TAG_BR 9 html_br, #define TAG_FORM 10 html_form, #define TAG_H1 11 html_h1, #define TAG_H2 12 html_h2, #define TAG_H3 13 html_h3, #define TAG_H4 14 html_h4, #define TAG_IMG 15 html_img, #define TAG_INPUT 16 html_input, #define TAG_LI 17 html_li, #define TAG_P 18 html_p, #define TAG_SCRIPT 19 html_script, #define TAG_SELECT 20 html_select, #define TAG_STYLE 21 html_style, #define TAG_TR 22 html_tr, #define TAG_LAST 23 last, }; /*-----------------------------------------------------------------------------------*/ static unsigned char iswhitespace(char c) { return (c == ISO_space || c == ISO_nl || c == ISO_cr || c == ISO_ht); } /*-----------------------------------------------------------------------------------*/ #if WWW_CONF_FORMS static void init_input(void) { s.inputtype = HTMLPARSER_INPUTTYPE_NONE; s.inputname[0] = s.inputvalue[0] = s.formaction[WWW_CONF_MAX_FORMACTIONLEN] = s.inputname[WWW_CONF_MAX_INPUTNAMELEN] = s.inputvalue[WWW_CONF_MAX_INPUTVALUELEN] = 0; s.inputvaluesize = 20; /* De facto default size */ } #endif /* WWW_CONF_FORMS */ /*-----------------------------------------------------------------------------------*/ void htmlparser_init(void) { s.majorstate = s.lastmajorstate = MAJORSTATE_DISCARD; s.minorstate = MINORSTATE_TEXT; s.wordlen = 0; #if WWW_CONF_FORMS s.formaction[0] = 0; #endif /* WWW_CONF_FORMS */ } /*-----------------------------------------------------------------------------------*/ static char lowercase(char c) { /* XXX: This is a *brute force* approach to lower-case converting and should *not* be used anywhere else! It works for our purposes, however (i.e., HTML tags). 
*/ if(c > 0x40) { return (c & 0x1f) | 0x60; } else { return c; } } /*-----------------------------------------------------------------------------------*/ static void endtagfound(void) { s.tag[s.tagptr] = 0; s.tagattr[s.tagattrptr] = 0; s.tagattrparam[s.tagattrparamptr] = 0; } /*-----------------------------------------------------------------------------------*/ static void switch_majorstate(unsigned char newstate) { if(s.majorstate != newstate) { PRINTF(("Switching state from %d to %d (%d)\n", s.majorstate, newstate, s.lastmajorstate)); s.lastmajorstate = s.majorstate; s.majorstate = newstate; } } /*-----------------------------------------------------------------------------------*/ static void add_char(unsigned char c) { if(s.wordlen < WWW_CONF_WEBPAGE_WIDTH - 1 && c < 0x80) { s.word[s.wordlen] = c; ++s.wordlen; } } /*-----------------------------------------------------------------------------------*/ static void do_word(void) { if(s.wordlen > 0) { if(s.majorstate == MAJORSTATE_LINK) { if(s.word[s.wordlen - 1] != ISO_space) { add_char(ISO_space); } } else if(s.majorstate >= MAJORSTATE_DISCARD) { s.wordlen = 0; } else { s.word[s.wordlen] = '\0'; htmlparser_word(s.word, s.wordlen); s.wordlen = 0; } } } /*-----------------------------------------------------------------------------------*/ static void newline(void) { do_word(); htmlparser_newline(); } /*-----------------------------------------------------------------------------------*/ static unsigned char find_tag(char *tag) { static unsigned char first, last, i, tabi; static char tagc; first = TAG_FIRST; last = TAG_LAST; i = 0; do { tagc = tag[i]; if((tagc == 0 || tagc == ISO_slash) && tags[first][i] == 0) { return first; } tabi = first; /* First, find first matching tag from table. */ while(tagc > (tags[tabi])[i] && tabi < last) { ++tabi; } first = tabi; /* Second, find last matching tag from table. 
*/ while(tagc == (tags[tabi])[i] && tabi < last) { ++tabi; } last = tabi; /* If first and last matching tags are equal, we have a non-match and return. Else we continue with the next character. */ ++i; } while(last != first); return TAG_LAST; } /*-----------------------------------------------------------------------------------*/ static void parse_tag(void) { static char *tagattrparam; static unsigned char tag; static unsigned char size; tag = find_tag(s.tag); /* If we are inside a . */ if(s.majorstate == MAJORSTATE_SCRIPT && tag != TAG_SLASHSCRIPT) { return; } PRINTF(("Parsing tag '%s' '%s' '%s'\n", s.tag, s.tagattr, s.tagattrparam)); switch(tag) { case TAG_P: case TAG_H1: case TAG_H2: case TAG_H3: case TAG_H4: newline(); /* FALLTHROUGH */ case TAG_BR: case TAG_TR: case TAG_SLASHDIV: case TAG_SLASHH: newline(); break; case TAG_LI: if(s.tagattr[0] == 0) { newline(); add_char(ISO_asterisk); add_char(ISO_space); } break; case TAG_SCRIPT: switch_majorstate(MAJORSTATE_SCRIPT); break; case TAG_STYLE: case TAG_SELECT: switch_majorstate(MAJORSTATE_DISCARD); break; case TAG_SLASHSCRIPT: case TAG_SLASHSTYLE: case TAG_SLASHSELECT: do_word(); switch_majorstate(s.lastmajorstate); break; case TAG_BODY: do_word(); s.majorstate = s.lastmajorstate = MAJORSTATE_BODY; break; case TAG_IMG: if(strncmp(s.tagattr, html_alt, sizeof(html_alt)) == 0 && s.tagattrparam[0] != 0) { add_char(ISO_lt); tagattrparam = &s.tagattrparam[0]; while(*tagattrparam) { add_char(*tagattrparam); ++tagattrparam; } add_char(ISO_gt); do_word(); } break; case TAG_A: PRINTF(("A %s %s\n", s.tagattr, s.tagattrparam)); if(strncmp(s.tagattr, html_href, sizeof(html_href)) == 0 && s.tagattrparam[0] != 0) { strcpy(s.linkurl, s.tagattrparam); do_word(); switch_majorstate(MAJORSTATE_LINK); } break; case TAG_SLASHA: if(s.majorstate == MAJORSTATE_LINK) { switch_majorstate(s.lastmajorstate); s.word[s.wordlen] = 0; htmlparser_link(s.word, s.wordlen, s.linkurl); s.wordlen = 0; } break; #if WWW_CONF_FORMS case TAG_FORM: /* 
First check if we are called at the end of a form tag. If so, we should propagate the form action. */ if(s.tagattr[0] == 0 && s.formaction[0] != 0) { htmlparser_form(s.formaction); init_input(); } else { PRINTF(("Form tag\n")); switch_majorstate(MAJORSTATE_FORM); if(strncmp(s.tagattr, html_action, sizeof(html_action)) == 0) { PRINTF(("Form action '%s'\n", s.tagattrparam)); strncpy(s.formaction, s.tagattrparam, WWW_CONF_MAX_FORMACTIONLEN - 1); } } break; case TAG_SLASHFORM: switch_majorstate(MAJORSTATE_BODY); s.formaction[0] = 0; break; case TAG_INPUT: if(s.majorstate == MAJORSTATE_FORM) { /* First check if we are called at the end of an input tag. If so, we should render the input widget. */ if(s.tagattr[0] == 0 && s.inputname[0] != 0) { PRINTF(("Render input type %d\n", s.inputtype)); switch(s.inputtype) { case HTMLPARSER_INPUTTYPE_NONE: case HTMLPARSER_INPUTTYPE_TEXT: case HTMLPARSER_INPUTTYPE_HIDDEN: htmlparser_inputfield(s.inputtype, s.inputvaluesize, s.inputvalue, s.inputname); break; case HTMLPARSER_INPUTTYPE_SUBMIT: case HTMLPARSER_INPUTTYPE_IMAGE: htmlparser_submitbutton(s.inputvalue, s.inputname); break; } init_input(); } else { PRINTF(("Input '%s' '%s'\n", s.tagattr, s.tagattrparam)); if(strncmp(s.tagattr, html_type, sizeof(html_type)) == 0) { if(strncmp(s.tagattrparam, html_submit, sizeof(html_submit)) == 0) { s.inputtype = HTMLPARSER_INPUTTYPE_SUBMIT; } else if(strncmp(s.tagattrparam, html_image, sizeof(html_image)) == 0) { s.inputtype = HTMLPARSER_INPUTTYPE_IMAGE; } else if(strncmp(s.tagattrparam, html_text, sizeof(html_text)) == 0) { s.inputtype = HTMLPARSER_INPUTTYPE_TEXT; } else if(strncmp(s.tagattrparam, html_hidden, sizeof(html_hidden)) == 0) { s.inputtype = HTMLPARSER_INPUTTYPE_HIDDEN; } else { s.inputtype = HTMLPARSER_INPUTTYPE_OTHER; } } else if(strncmp(s.tagattr, html_name, sizeof(html_name)) == 0) { strncpy(s.inputname, s.tagattrparam, WWW_CONF_MAX_INPUTNAMELEN); } else if(strncmp(s.tagattr, html_alt, sizeof(html_alt)) == 0 && s.inputtype == 
HTMLPARSER_INPUTTYPE_IMAGE) { strncpy(s.inputvalue, s.tagattrparam, WWW_CONF_MAX_INPUTVALUELEN); } else if(strncmp(s.tagattr, html_value, sizeof(html_value)) == 0) { strncpy(s.inputvalue, s.tagattrparam, WWW_CONF_MAX_INPUTVALUELEN); } else if(strncmp(s.tagattr, html_size, sizeof(html_size)) == 0) { size = 0; if(s.tagattrparam[0] >= '0' && s.tagattrparam[0] <= '9') { size = s.tagattrparam[0] - '0'; if(s.tagattrparam[1] >= '0' && s.tagattrparam[1] <= '9') { size = size * 10 + (s.tagattrparam[1] - '0'); } } if(size >= WWW_CONF_MAX_INPUTVALUELEN) { size = WWW_CONF_MAX_INPUTVALUELEN - 1; } s.inputvaluesize = size; } } } break; #endif /* WWW_CONF_FORMS */ } } /*-----------------------------------------------------------------------------------*/ static uint16_t parse_word(char *data, uint8_t dlen) { static uint8_t i; static uint8_t len; unsigned char c; len = dlen; switch(s.minorstate) { case MINORSTATE_TEXT: for(i = 0; i < len; ++i) { c = data[i]; if(iswhitespace(c)) { do_word(); } else if(c == ISO_lt) { s.minorstate = MINORSTATE_TAG; s.tagptr = 0; break; } else if(c == ISO_ampersand) { s.minorstate = MINORSTATE_EXTCHAR; break; } else { add_char(c); } } break; case MINORSTATE_EXTCHAR: for(i = 0; i < len; ++i) { c = data[i]; if(c == ISO_semicolon) { s.minorstate = MINORSTATE_TEXT; add_char(' '); break; } else if(iswhitespace(c)) { s.minorstate = MINORSTATE_TEXT; add_char('&'); add_char(' '); break; } } break; case MINORSTATE_TAG: /* If we are inside a we mustn't mistake a JavaScript equation with a '<' as a tag. So we check for the very next character to be a '/' as we're only interested in parsing the . */ if(s.majorstate == MAJORSTATE_SCRIPT && data[0] != ISO_slash) { s.minorstate = MINORSTATE_TEXT; break; } /* We are currently parsing within the name of a tag. We check for the end of a tag (the '>' character) or whitespace (which indicates that we should parse a tag attr argument instead). */ for(i = 0; i < len; ++i) { c = data[i]; if(c == ISO_gt) { /* Full tag found. 
We continue parsing regular text. */ s.minorstate = MINORSTATE_TEXT; s.tagattrptr = s.tagattrparamptr = 0; endtagfound(); parse_tag(); break; } else if(iswhitespace(c)) { /* The name of the tag found. We continue parsing the tag attr.*/ s.minorstate = MINORSTATE_TAGATTR; s.tagattrptr = 0; endtagfound(); break; } else { /* Keep track of the name of the tag, but convert it to lower case. */ s.tag[s.tagptr] = lowercase(c); ++s.tagptr; /* Check if the ->tag field is full. If so, we just eat up any data left in the tag. */ if(s.tagptr == sizeof(s.tag)) { s.minorstate = MINORSTATE_TAGEND; break; } } /* Check for HTML comment, indicated by