<?php

/**
 * Takes a well formed list of tokens and fixes their nesting.
 *
 * HTML elements dictate which elements are allowed to be their children,
 * for example, you can't have a p tag in a span tag.  Other elements have
 * much more rigorous definitions: tables, for instance, require a specific
 * order for their elements.  There are also constraints not expressible by
 * document type definitions, such as the chameleon nature of ins/del
 * tags and global child exclusions.
 *
 * The first major objective of this strategy is to iterate through all
 * the nodes and determine whether or not their children conform to the
 * element's definition.  If they do not, the child definition may
 * optionally supply an amended list of elements that is valid or
 * require that the entire node be deleted (and the previous node
 * rescanned).
 *
 * The second objective is to ensure that explicitly excluded elements of
 * an element do not appear in its children.  Code that accomplishes this
 * task is pervasive through the strategy, though the two are distinct tasks
 * and could, theoretically, be separated (although it's not recommended).
 *
 * @note Whether or not unrecognized children are silently dropped or
 *       translated into text depends on the child definitions.
 *
 * @todo Enable nodes to be bubbled out of the structure.  This is
 *       easier with our new algorithm.
 */
class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
{
    /**
     * Converts the token list to a tree, validates every element's
     * children against its child definition (marking excluded or invalid
     * nodes as dead), and flattens the tree back into a token list.
     *
     * @param HTMLPurifier_Token[] $tokens
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array|HTMLPurifier_Token[]
     */
    public function execute($tokens, $config, $context)
    {
        //####################################################################//
        // Pre-processing

        // O(n) pass to convert to a tree, so that we can efficiently
        // refer to substrings
        $top_node = HTMLPurifier_Arborize::arborize($tokens, $config, $context);

        // get a copy of the HTML definition
        $definition = $config->getHTMLDefinition();
        $parent_def = $definition->info_parent_def;

        $excludes_enabled = !$config->get('Core.DisableExcludes');

        // setup the context variable 'IsInline', for chameleon processing
        // is 'false' when we are not inline, 'true' when it must always
        // be inline, and an integer when it is inline for a certain
        // branch of the document tree
        //
        // NOTE(review): the variables handed to $context->register() below
        // ($is_inline, $node, $token) are reassigned inside the main loop.
        // Presumably register() binds them by reference so that child
        // validators observe the current values -- confirm against
        // HTMLPurifier_Context before renaming or reordering them.
        $is_inline = $parent_def->descendants_are_inline;
        $context->register('IsInline', $is_inline);

        // setup error collector
        $e =& $context->get('ErrorCollector', true);

        //####################################################################//
        // Loop initialization

        // variable that contains the start token while we are processing
        // nodes. This enables error reporting to do its job
        $node = $top_node;
        // dummy token
        list($token) = $node->toTokenPair();
        $context->register('CurrentNode', $node);
        $context->register('CurrentToken', $token);

        //####################################################################//
        // Loop

        // We need to implement a post-order traversal iteratively, to
        // avoid running into stack space limits.  This is pretty tricky
        // to reason about, so we just manually stack-ify the recursive
        // variant:
        //
        //  function f($node) {
        //      foreach ($node->children as $child) {
        //          f($child);
        //      }
        //      validate($node);
        //  }
        //
        // A stack frame is array($node, $is_inline, $excludes, $ix), where
        // $excludes is the set of element names excluded at this node and
        // $ix is the index of the next entry of $node->children to visit.
        $stack = array(
            array(
                $top_node,
                $parent_def->descendants_are_inline,
                $parent_def->excludes, // exclusions
                0
            )
        );

        while (!empty($stack)) {
            list($node, $is_inline, $excludes, $ix) = array_pop($stack);

            // recursive call
            $go = false;
            // Only the top node's frame leaves the stack empty once popped,
            // so an empty stack identifies the parent (root) definition.
            $def = empty($stack) ? $parent_def : $definition->info[$node->name];
            while (isset($node->children[$ix])) {
                $child = $node->children[$ix++];
                if ($child instanceof HTMLPurifier_Node_Element) {
                    $go = true;
                    // Re-push this frame so we resume at $ix once the
                    // child has been fully processed, then descend.
                    $stack[] = array($node, $is_inline, $excludes, $ix);
                    $stack[] = array(
                        $child,
                        // ToDo: I don't think it matters if it's def or
                        // child_def, but double check this...
                        $is_inline || $def->descendants_are_inline,
                        empty($def->excludes) ? $excludes
                                              : array_merge($excludes, $def->excludes),
                        0
                    );
                    break;
                }
            }
            if ($go) {
                continue;
            }

            // All element children have been handled; validate this node.
            list($token) = $node->toTokenPair();

            // base case
            if ($excludes_enabled && isset($excludes[$node->name])) {
                $node->dead = true;
                if ($e) {
                    $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
                }
            } else {
                // XXX I suppose it would be slightly more efficient to
                // avoid the allocation here and have children
                // strategies handle it
                $children = array();
                foreach ($node->children as $child) {
                    // skip children that were marked dead while they
                    // were being processed
                    if (!$child->dead) {
                        $children[] = $child;
                    }
                }
                $result = $def->child->validateChildren($children, $config, $context);
                if ($result === true) {
                    // children are valid as-is; keep the filtered list
                    $node->children = $children;
                } elseif ($result === false) {
                    // no valid arrangement of children: drop the node
                    $node->dead = true;
                    if ($e) {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
                    }
                } else {
                    // the child definition supplied a replacement list
                    $node->children = $result;
                    if ($e) {
                        // XXX This will miss mutations of internal nodes.
                        // Perhaps defer to the child validators
                        if (empty($result) && !empty($children)) {
                            $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
                        } elseif ($result != $children) {
                            $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
                        }
                    }
                }
            }
        }

        //####################################################################//
        // Post-processing

        // remove context variables
        $context->destroy('IsInline');
        $context->destroy('CurrentNode');
        $context->destroy('CurrentToken');

        //####################################################################//
        // Return

        // The last frame processed is the bottom of the stack, so $node is
        // the top node again at this point.
        return HTMLPurifier_Arborize::flatten($node, $config, $context);
    }
}

// vim: et sw=4 sts=4