From 2dfdd6a5f9c413b4727abca44c7dc6701a5e386e Mon Sep 17 00:00:00 2001 From: "Michael Kaufmann (d00p)" Date: Sat, 21 Sep 2013 12:18:55 +0200 Subject: [PATCH] Update HTMLPurifier to current stable version 4.5.0 --- lib/classes/htmlpurifier/VERSION | 2 +- .../library/HTMLPurifier.autoload.php | 5 + .../library/HTMLPurifier.composer.php | 4 + .../library/HTMLPurifier.includes.php | 12 +- .../htmlpurifier/library/HTMLPurifier.php | 6 +- .../library/HTMLPurifier.safe-includes.php | 10 + .../HTMLPurifier/AttrDef/CSS/Background.php | 2 +- .../HTMLPurifier/AttrDef/CSS/FontFamily.php | 149 ++++++++++- .../HTMLPurifier/AttrDef/CSS/Ident.php | 24 ++ .../library/HTMLPurifier/AttrDef/CSS/URI.php | 9 + .../library/HTMLPurifier/AttrDef/Clone.php | 28 ++ .../HTMLPurifier/AttrDef/HTML/Color.php | 3 +- .../library/HTMLPurifier/AttrDef/HTML/ID.php | 22 +- .../library/HTMLPurifier/AttrDef/URI.php | 2 +- .../library/HTMLPurifier/AttrDef/URI/Host.php | 51 +++- .../HTMLPurifier/AttrTransform/Nofollow.php | 45 ++++ .../HTMLPurifier/AttrTransform/SafeParam.php | 3 +- .../AttrTransform/TargetBlank.php | 38 +++ .../library/HTMLPurifier/AttrTypes.php | 14 + .../library/HTMLPurifier/Bootstrap.php | 57 +++-- .../library/HTMLPurifier/CSSDefinition.php | 33 ++- .../library/HTMLPurifier/ChildDef/List.php | 120 +++++++++ .../library/HTMLPurifier/ChildDef/Table.php | 95 ++++++- .../library/HTMLPurifier/Config.php | 240 ++++++++++++++---- .../library/HTMLPurifier/ConfigSchema.php | 8 +- .../HTMLPurifier/ConfigSchema/schema.ser | Bin 13701 -> 14880 bytes .../ConfigSchema/schema/CSS.AllowedFonts.txt | 12 + .../ConfigSchema/schema/CSS.Trusted.txt | 9 + .../schema/Cache.SerializerPermissions.txt | 11 + .../schema/Core.ColorKeywords.txt | 3 +- .../schema/Core.DisableExcludes.txt | 14 + .../ConfigSchema/schema/Core.EnableIDNA.txt | 9 + .../schema/HTML.AllowedComments.txt | 10 + .../schema/HTML.AllowedCommentsRegexp.txt | 15 ++ .../ConfigSchema/schema/HTML.Nofollow.txt | 7 + 
.../ConfigSchema/schema/HTML.SafeIframe.txt | 13 + .../schema/HTML.SafeScripting.txt | 10 + .../ConfigSchema/schema/HTML.TargetBlank.txt | 8 + .../ConfigSchema/schema/HTML.Trusted.txt | 1 + .../schema/Output.FixInnerHTML.txt | 15 ++ .../schema/URI.SafeIframeRegexp.txt | 22 ++ .../library/HTMLPurifier/Definition.php | 11 + .../DefinitionCache/Serializer.php | 49 ++-- .../library/HTMLPurifier/ElementDef.php | 20 +- .../library/HTMLPurifier/Encoder.php | 165 ++++++++++-- .../HTMLPurifier/EntityLookup/entities.ser | 2 +- .../Filter/ExtractStyleBlocks.php | 176 ++++++++++++- .../library/HTMLPurifier/Generator.php | 51 +++- .../library/HTMLPurifier/HTMLDefinition.php | 2 +- .../library/HTMLPurifier/HTMLModule/Bdo.php | 2 +- .../library/HTMLPurifier/HTMLModule/Forms.php | 5 +- .../HTMLPurifier/HTMLModule/Iframe.php | 38 +++ .../HTMLPurifier/HTMLModule/Legacy.php | 18 +- .../library/HTMLPurifier/HTMLModule/List.php | 14 +- .../library/HTMLPurifier/HTMLModule/Name.php | 2 +- .../HTMLPurifier/HTMLModule/Nofollow.php | 19 ++ .../HTMLPurifier/HTMLModule/SafeEmbed.php | 2 +- .../HTMLPurifier/HTMLModule/SafeObject.php | 1 - .../HTMLPurifier/HTMLModule/SafeScripting.php | 37 +++ .../HTMLPurifier/HTMLModule/Scripting.php | 4 +- .../HTMLPurifier/HTMLModule/Tables.php | 3 + .../HTMLPurifier/HTMLModule/TargetBlank.php | 19 ++ .../HTMLPurifier/HTMLModuleManager.php | 29 ++- .../HTMLPurifier/Injector/RemoveEmpty.php | 5 +- .../library/HTMLPurifier/Lexer.php | 6 +- .../library/HTMLPurifier/Lexer/DOMLex.php | 76 ++++-- .../HTMLPurifier/Strategy/Composite.php | 2 - .../HTMLPurifier/Strategy/FixNesting.php | 22 +- .../HTMLPurifier/Strategy/MakeWellFormed.php | 105 ++++++-- .../Strategy/RemoveForeignElements.php | 31 ++- .../HTMLPurifier/TagTransform/Font.php | 16 +- .../library/HTMLPurifier/Token/Tag.php | 3 +- .../htmlpurifier/library/HTMLPurifier/URI.php | 133 +++++++--- .../library/HTMLPurifier/URIDefinition.php | 14 +- .../library/HTMLPurifier/URIFilter.php | 26 +- 
.../HTMLPurifier/URIFilter/HostBlacklist.php | 4 + .../library/HTMLPurifier/URIFilter/Munge.php | 9 +- .../HTMLPurifier/URIFilter/SafeIframe.php | 35 +++ .../library/HTMLPurifier/URIScheme.php | 67 ++++- .../library/HTMLPurifier/URIScheme/data.php | 7 +- .../library/HTMLPurifier/URIScheme/file.php | 10 +- .../library/HTMLPurifier/URIScheme/ftp.php | 3 +- .../library/HTMLPurifier/URIScheme/http.php | 3 +- .../library/HTMLPurifier/URIScheme/https.php | 1 + .../library/HTMLPurifier/URIScheme/mailto.php | 4 +- .../library/HTMLPurifier/URIScheme/news.php | 4 +- .../library/HTMLPurifier/URIScheme/nntp.php | 3 +- 87 files changed, 2057 insertions(+), 342 deletions(-) create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier.composer.php create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/CSS/Ident.php create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/Clone.php create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier/AttrTransform/Nofollow.php create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier/AttrTransform/TargetBlank.php create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier/ChildDef/List.php create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/CSS.AllowedFonts.txt create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/CSS.Trusted.txt create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/Cache.SerializerPermissions.txt create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/Core.DisableExcludes.txt create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/Core.EnableIDNA.txt create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.AllowedComments.txt create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.AllowedCommentsRegexp.txt create mode 100644 
lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.Nofollow.txt create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.SafeIframe.txt create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.SafeScripting.txt create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.TargetBlank.txt create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/Output.FixInnerHTML.txt create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/URI.SafeIframeRegexp.txt create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Iframe.php create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Nofollow.php create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/SafeScripting.php create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/TargetBlank.php create mode 100644 lib/classes/htmlpurifier/library/HTMLPurifier/URIFilter/SafeIframe.php diff --git a/lib/classes/htmlpurifier/VERSION b/lib/classes/htmlpurifier/VERSION index ef8d7569..a84947d6 100644 --- a/lib/classes/htmlpurifier/VERSION +++ b/lib/classes/htmlpurifier/VERSION @@ -1 +1 @@ -4.2.0 \ No newline at end of file +4.5.0 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier.autoload.php b/lib/classes/htmlpurifier/library/HTMLPurifier.autoload.php index ae93daad..acbc5216 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier.autoload.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier.autoload.php @@ -3,6 +3,7 @@ /** * @file * Convenience file that registers autoload handler for HTML Purifier. + * It also does some sanity checks. 
*/ if (function_exists('spl_autoload_register') && function_exists('spl_autoload_unregister')) { @@ -18,4 +19,8 @@ if (function_exists('spl_autoload_register') && function_exists('spl_autoload_un } } +if (ini_get('zend.ze1_compatibility_mode')) { + trigger_error("HTML Purifier is not compatible with zend.ze1_compatibility_mode; please turn it off", E_USER_ERROR); +} + // vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier.composer.php b/lib/classes/htmlpurifier/library/HTMLPurifier.composer.php new file mode 100644 index 00000000..6706f4e3 --- /dev/null +++ b/lib/classes/htmlpurifier/library/HTMLPurifier.composer.php @@ -0,0 +1,4 @@ +mungeRgb($string); // assumes URI doesn't have spaces in it - $bits = explode(' ', strtolower($string)); // bits to process + $bits = explode(' ', $string); // bits to process $caught = array(); $caught['color'] = false; diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/CSS/FontFamily.php b/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/CSS/FontFamily.php index 1b7dc608..321d991b 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/CSS/FontFamily.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/CSS/FontFamily.php @@ -2,11 +2,43 @@ /** * Validates a font family list according to CSS spec - * @todo whitelisting allowed fonts would be nice */ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef { + protected $mask = null; + + public function __construct() { + $this->mask = '_- '; + for ($c = 'a'; $c <= 'z'; $c++) $this->mask .= $c; + for ($c = 'A'; $c <= 'Z'; $c++) $this->mask .= $c; + for ($c = '0'; $c <= '9'; $c++) $this->mask .= $c; // cast-y, but should be fine + // special bytes used by UTF-8 + for ($i = 0x80; $i <= 0xFF; $i++) { + // We don't bother excluding invalid bytes in this range, + // because the our restriction of well-formed UTF-8 will + // prevent these from ever occurring. 
+ $this->mask .= chr($i); + } + + /* + PHP's internal strcspn implementation is + O(length of string * length of mask), making it inefficient + for large masks. However, it's still faster than + preg_match 8) + for (p = s1;;) { + spanp = s2; + do { + if (*spanp == c || p == s1_end) { + return p - s1; + } + } while (spanp++ < (s2_end - 1)); + c = *++p; + } + */ + // possible optimization: invert the mask. + } + public function validate($string, $config, $context) { static $generic_names = array( 'serif' => true, @@ -15,6 +47,7 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef 'fantasy' => true, 'cursive' => true ); + $allowed_fonts = $config->get('CSS.AllowedFonts'); // assume that no font names contain commas in them $fonts = explode(',', $string); @@ -24,7 +57,9 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef if ($font === '') continue; // match a generic name if (isset($generic_names[$font])) { - $final .= $font . ', '; + if ($allowed_fonts === null || isset($allowed_fonts[$font])) { + $final .= $font . ', '; + } continue; } // match a quoted name @@ -40,6 +75,10 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef // $font is a pure representation of the font name + if ($allowed_fonts !== null && !isset($allowed_fonts[$font])) { + continue; + } + if (ctype_alnum($font) && $font !== '') { // very simple font, allow it in unharmed $final .= $font . ', '; @@ -50,17 +89,103 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef // shouldn't show up regardless $font = str_replace(array("\n", "\t", "\r", "\x0C"), ' ', $font); - // These ugly transforms don't pose a security - // risk (as \\ and \" might). We could try to be clever and - // use single-quote wrapping when there is a double quote - // present, but I have choosen not to implement that. 
- // (warning: this code relies on the selection of quotation - // mark below) - $font = str_replace('\\', '\\5C ', $font); - $font = str_replace('"', '\\22 ', $font); + // Here, there are various classes of characters which need + // to be treated differently: + // - Alphanumeric characters are essentially safe. We + // handled these above. + // - Spaces require quoting, though most parsers will do + // the right thing if there aren't any characters that + // can be misinterpreted + // - Dashes rarely occur, but they fairly unproblematic + // for parsing/rendering purposes. + // The above characters cover the majority of Western font + // names. + // - Arbitrary Unicode characters not in ASCII. Because + // most parsers give little thought to Unicode, treatment + // of these codepoints is basically uniform, even for + // punctuation-like codepoints. These characters can + // show up in non-Western pages and are supported by most + // major browsers, for example: "MS 明朝" is a + // legitimate font-name + // . See + // the CSS3 spec for more examples: + // + // You can see live samples of these on the Internet: + // + // However, most of these fonts have ASCII equivalents: + // for example, 'MS Mincho', and it's considered + // professional to use ASCII font names instead of + // Unicode font names. Thanks Takeshi Terada for + // providing this information. + // The following characters, to my knowledge, have not been + // used to name font names. + // - Single quote. While theoretically you might find a + // font name that has a single quote in its name (serving + // as an apostrophe, e.g. Dave's Scribble), I haven't + // been able to find any actual examples of this. + // Internet Explorer's cssText translation (which I + // believe is invoked by innerHTML) normalizes any + // quoting to single quotes, and fails to escape single + // quotes. (Note that this is not IE's behavior for all + // CSS properties, just some sort of special casing for + // font-family). 
So a single quote *cannot* be used + // safely in the font-family context if there will be an + // innerHTML/cssText translation. Note that Firefox 3.x + // does this too. + // - Double quote. In IE, these get normalized to + // single-quotes, no matter what the encoding. (Fun + // fact, in IE8, the 'content' CSS property gained + // support, where they special cased to preserve encoded + // double quotes, but still translate unadorned double + // quotes into single quotes.) So, because their + // fixpoint behavior is identical to single quotes, they + // cannot be allowed either. Firefox 3.x displays + // single-quote style behavior. + // - Backslashes are reduced by one (so \\ -> \) every + // iteration, so they cannot be used safely. This shows + // up in IE7, IE8 and FF3 + // - Semicolons, commas and backticks are handled properly. + // - The rest of the ASCII punctuation is handled properly. + // We haven't checked what browsers do to unadorned + // versions, but this is not important as long as the + // browser doesn't /remove/ surrounding quotes (as IE does + // for HTML). + // + // With these results in hand, we conclude that there are + // various levels of safety: + // - Paranoid: alphanumeric, spaces and dashes(?) + // - International: Paranoid + non-ASCII Unicode + // - Edgy: Everything except quotes, backslashes + // - NoJS: Standards compliance, e.g. sod IE. Note that + // with some judicious character escaping (since certain + // types of escaping doesn't work) this is theoretically + // OK as long as innerHTML/cssText is not called. + // We believe that international is a reasonable default + // (that we will implement now), and once we do more + // extensive research, we may feel comfortable with dropping + // it down to edgy. - // complicated font, requires quoting - $final .= "\"$font\", "; // note that this will later get turned into " + // Edgy: alphanumeric, spaces, dashes, underscores and Unicode. 
Use of + // str(c)spn assumes that the string was already well formed + // Unicode (which of course it is). + if (strspn($font, $this->mask) !== strlen($font)) { + continue; + } + + // Historical: + // In the absence of innerHTML/cssText, these ugly + // transforms don't pose a security risk (as \\ and \" + // might--these escapes are not supported by most browsers). + // We could try to be clever and use single-quote wrapping + // when there is a double quote present, but I have choosen + // not to implement that. (NOTE: you can reduce the amount + // of escapes by one depending on what quoting style you use) + // $font = str_replace('\\', '\\5C ', $font); + // $font = str_replace('"', '\\22 ', $font); + // $font = str_replace("'", '\\27 ', $font); + + // font possibly with spaces, requires quoting + $final .= "'$font', "; } $final = rtrim($final, ', '); if ($final === '') return false; diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/CSS/Ident.php b/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/CSS/Ident.php new file mode 100644 index 00000000..779794a0 --- /dev/null +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/CSS/Ident.php @@ -0,0 +1,24 @@ +clone = $clone; + } + + public function validate($v, $config, $context) { + return $this->clone->validate($v, $config, $context); + } + + public function make($string) { + return clone $this->clone; + } + +} + +// vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/HTML/Color.php b/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/HTML/Color.php index 0575d8c3..347cca59 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/HTML/Color.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/HTML/Color.php @@ -14,7 +14,8 @@ class HTMLPurifier_AttrDef_HTML_Color extends HTMLPurifier_AttrDef $string = trim($string); if (empty($string)) return false; - if (isset($colors[$string])) return $colors[$string]; + $lower = 
strtolower($string); + if (isset($colors[$lower])) return $colors[$lower]; if ($string[0] === '#') $hex = substr($string, 1); else $hex = $string; diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/HTML/ID.php b/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/HTML/ID.php index 373529f7..18a34d23 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/HTML/ID.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/HTML/ID.php @@ -12,12 +12,22 @@ class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef { - // ref functionality disabled, since we also have to verify - // whether or not the ID it refers to exists + // selector is NOT a valid thing to use for IDREFs, because IDREFs + // *must* target IDs that exist, whereas selector #ids do not. + + /** + * Determines whether or not we're validating an ID in a CSS + * selector context. + */ + protected $selector; + + public function __construct($selector = false) { + $this->selector = $selector; + } public function validate($id, $config, $context) { - if (!$config->get('Attr.EnableID')) return false; + if (!$this->selector && !$config->get('Attr.EnableID')) return false; $id = trim($id); // trim it first @@ -33,10 +43,10 @@ class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef '%Attr.IDPrefix is set', E_USER_WARNING); } - //if (!$this->ref) { + if (!$this->selector) { $id_accumulator =& $context->get('IDAccumulator'); if (isset($id_accumulator->ids[$id])) return false; - //} + } // we purposely avoid using regex, hopefully this is faster @@ -56,7 +66,7 @@ class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef return false; } - if (/*!$this->ref && */$result) $id_accumulator->add($id); + if (!$this->selector && $result) $id_accumulator->add($id); // if no change was made to the ID, return the result // else, return the new id if stripping whitespace made it diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/URI.php 
b/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/URI.php index 0f861c0d..48d27a90 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/URI.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/URI.php @@ -19,7 +19,7 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef } public function make($string) { - $embeds = (bool) $string; + $embeds = ($string === 'embedded'); return new HTMLPurifier_AttrDef_URI($embeds); } diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/URI/Host.php b/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/URI/Host.php index 7b84a2b8..6f306616 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/URI/Host.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/AttrDef/URI/Host.php @@ -23,6 +23,12 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef public function validate($string, $config, $context) { $length = strlen($string); + // empty hostname is OK; it's usually semantically equivalent: + // the default host as defined by a URI scheme is used: + // + // If the URI scheme defines a default for host, then that + // default applies when the host subcomponent is undefined + // or when the registered name is empty (zero length). if ($string === '') return ''; if ($length > 1 && $string[0] === '[' && $string[$length-1] === ']') { //IPv6 @@ -38,9 +44,8 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef // A regular domain name. - // This breaks I18N domain names, but we don't have proper IRI support, - // so force users to insert Punycode. If there's complaining we'll - // try to fix things into an international friendly form. + // This doesn't match I18N domain names, but we don't have proper IRI support, + // so force users to insert Punycode. 
// The productions describing this are: $a = '[a-z]'; // alpha @@ -51,10 +56,44 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef // toplabel = alpha | alpha *( alphanum | "-" ) alphanum $toplabel = "$a($and*$an)?"; // hostname = *( domainlabel "." ) toplabel [ "." ] - $match = preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string); - if (!$match) return false; + if (preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string)) { + return $string; + } - return $string; + // If we have Net_IDNA2 support, we can support IRIs by + // punycoding them. (This is the most portable thing to do, + // since otherwise we have to assume browsers support + + if ($config->get('Core.EnableIDNA')) { + $idna = new Net_IDNA2(array('encoding' => 'utf8', 'overlong' => false, 'strict' => true)); + // we need to encode each period separately + $parts = explode('.', $string); + try { + $new_parts = array(); + foreach ($parts as $part) { + $encodable = false; + for ($i = 0, $c = strlen($part); $i < $c; $i++) { + if (ord($part[$i]) > 0x7a) { + $encodable = true; + break; + } + } + if (!$encodable) { + $new_parts[] = $part; + } else { + $new_parts[] = $idna->encode($part); + } + } + $string = implode('.', $new_parts); + if (preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string)) { + return $string; + } + } catch (Exception $e) { + // XXX error reporting + } + } + + return false; } } diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/AttrTransform/Nofollow.php b/lib/classes/htmlpurifier/library/HTMLPurifier/AttrTransform/Nofollow.php new file mode 100644 index 00000000..e699c79a --- /dev/null +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/AttrTransform/Nofollow.php @@ -0,0 +1,45 @@ +parser = new HTMLPurifier_URIParser(); + } + + public function transform($attr, $config, $context) { + + if (!isset($attr['href'])) { + return $attr; + } + + // XXX Kind of inefficient + $url = $this->parser->parse($attr['href']); + $scheme = $url->getSchemeObj($config, 
$context); + + if ($scheme->browsable && !$url->isLocal($config, $context)) { + if (isset($attr['rel'])) { + $rels = explode(' ', $attr['rel']); + if (!in_array('nofollow', $rels)) { + $rels[] = 'nofollow'; + } + $attr['rel'] = implode(' ', $rels); + } else { + $attr['rel'] = 'nofollow'; + } + } + + return $attr; + + } + +} + +// vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/AttrTransform/SafeParam.php b/lib/classes/htmlpurifier/library/HTMLPurifier/AttrTransform/SafeParam.php index 91f67b08..21ac90b8 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/AttrTransform/SafeParam.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/AttrTransform/SafeParam.php @@ -19,6 +19,7 @@ class HTMLPurifier_AttrTransform_SafeParam extends HTMLPurifier_AttrTransform public function __construct() { $this->uri = new HTMLPurifier_AttrDef_URI(true); // embedded + $this->wmode = new HTMLPurifier_AttrDef_Enum(array('window', 'opaque', 'transparent')); } public function transform($attr, $config, $context) { @@ -41,7 +42,7 @@ class HTMLPurifier_AttrTransform_SafeParam extends HTMLPurifier_AttrTransform } break; case 'wmode': - $attr['value'] = 'window'; + $attr['value'] = $this->wmode->validate($attr['value'], $config, $context); break; case 'movie': case 'src': diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/AttrTransform/TargetBlank.php b/lib/classes/htmlpurifier/library/HTMLPurifier/AttrTransform/TargetBlank.php new file mode 100644 index 00000000..deba8b40 --- /dev/null +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/AttrTransform/TargetBlank.php @@ -0,0 +1,38 @@ +parser = new HTMLPurifier_URIParser(); + } + + public function transform($attr, $config, $context) { + + if (!isset($attr['href'])) { + return $attr; + } + + // XXX Kind of inefficient + $url = $this->parser->parse($attr['href']); + $scheme = $url->getSchemeObj($config, $context); + + if ($scheme->browsable && !$url->isBenign($config, $context)) { + $attr['target'] = 
'_blank'; + } + + return $attr; + + } + +} + +// vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/AttrTypes.php b/lib/classes/htmlpurifier/library/HTMLPurifier/AttrTypes.php index 82022559..3a65d349 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/AttrTypes.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/AttrTypes.php @@ -15,6 +15,13 @@ class HTMLPurifier_AttrTypes * types. */ public function __construct() { + // XXX This is kind of poor, since we don't actually /clone/ + // instances; instead, we use the supplied make() attribute. So, + // the underlying class must know how to deal with arguments. + // With the old implementation of Enum, that ignored its + // arguments when handling a make dispatch, the IAlign + // definition wouldn't work. + // pseudo-types, must be instantiated via shorthand $this->info['Enum'] = new HTMLPurifier_AttrDef_Enum(); $this->info['Bool'] = new HTMLPurifier_AttrDef_HTML_Bool(); @@ -29,6 +36,9 @@ class HTMLPurifier_AttrTypes $this->info['URI'] = new HTMLPurifier_AttrDef_URI(); $this->info['LanguageCode'] = new HTMLPurifier_AttrDef_Lang(); $this->info['Color'] = new HTMLPurifier_AttrDef_HTML_Color(); + $this->info['IAlign'] = self::makeEnum('top,middle,bottom,left,right'); + $this->info['LAlign'] = self::makeEnum('top,bottom,left,right'); + $this->info['FrameTarget'] = new HTMLPurifier_AttrDef_HTML_FrameTarget(); // unimplemented aliases $this->info['ContentType'] = new HTMLPurifier_AttrDef_Text(); @@ -44,6 +54,10 @@ class HTMLPurifier_AttrTypes $this->info['Number'] = new HTMLPurifier_AttrDef_Integer(false, false, true); } + private static function makeEnum($in) { + return new HTMLPurifier_AttrDef_Clone(new HTMLPurifier_AttrDef_Enum(explode(',', $in))); + } + /** * Retrieves a type * @param $type String type name diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/Bootstrap.php b/lib/classes/htmlpurifier/library/HTMLPurifier/Bootstrap.php index 47b15e1e..ce6c2a6b 100644 --- 
a/lib/classes/htmlpurifier/library/HTMLPurifier/Bootstrap.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/Bootstrap.php @@ -37,7 +37,12 @@ class HTMLPurifier_Bootstrap public static function autoload($class) { $file = HTMLPurifier_Bootstrap::getPath($class); if (!$file) return false; - require HTMLPURIFIER_PREFIX . '/' . $file; + // Technically speaking, it should be ok and more efficient to + // just do 'require', but Antonio Parraga reports that with + // Zend extensions such as Zend debugger and APC, this invariant + // may be broken. Since we have efficient alternatives, pay + // the cost here and avoid the bug. + require_once HTMLPURIFIER_PREFIX . '/' . $file; return true; } @@ -65,31 +70,37 @@ class HTMLPurifier_Bootstrap if ( ($funcs = spl_autoload_functions()) === false ) { spl_autoload_register($autoload); } elseif (function_exists('spl_autoload_unregister')) { - $compat = version_compare(PHP_VERSION, '5.1.2', '<=') && - version_compare(PHP_VERSION, '5.1.0', '>='); - foreach ($funcs as $func) { - if (is_array($func)) { - // :TRICKY: There are some compatibility issues and some - // places where we need to error out - $reflector = new ReflectionMethod($func[0], $func[1]); - if (!$reflector->isStatic()) { - throw new Exception(' - HTML Purifier autoloader registrar is not compatible - with non-static object methods due to PHP Bug #44144; - Please do not use HTMLPurifier.autoload.php (or any - file that includes this file); instead, place the code: - spl_autoload_register(array(\'HTMLPurifier_Bootstrap\', \'autoload\')) - after your own autoloaders. 
- '); + if (version_compare(PHP_VERSION, '5.3.0', '>=')) { + // prepend flag exists, no need for shenanigans + spl_autoload_register($autoload, true, true); + } else { + $buggy = version_compare(PHP_VERSION, '5.2.11', '<'); + $compat = version_compare(PHP_VERSION, '5.1.2', '<=') && + version_compare(PHP_VERSION, '5.1.0', '>='); + foreach ($funcs as $func) { + if ($buggy && is_array($func)) { + // :TRICKY: There are some compatibility issues and some + // places where we need to error out + $reflector = new ReflectionMethod($func[0], $func[1]); + if (!$reflector->isStatic()) { + throw new Exception(' + HTML Purifier autoloader registrar is not compatible + with non-static object methods due to PHP Bug #44144; + Please do not use HTMLPurifier.autoload.php (or any + file that includes this file); instead, place the code: + spl_autoload_register(array(\'HTMLPurifier_Bootstrap\', \'autoload\')) + after your own autoloaders. + '); + } + // Suprisingly, spl_autoload_register supports the + // Class::staticMethod callback format, although call_user_func doesn't + if ($compat) $func = implode('::', $func); } - // Suprisingly, spl_autoload_register supports the - // Class::staticMethod callback format, although call_user_func doesn't - if ($compat) $func = implode('::', $func); + spl_autoload_unregister($func); } - spl_autoload_unregister($func); + spl_autoload_register($autoload); + foreach ($funcs as $func) spl_autoload_register($func); } - spl_autoload_register($autoload); - foreach ($funcs as $func) spl_autoload_register($func); } } diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/CSSDefinition.php b/lib/classes/htmlpurifier/library/HTMLPurifier/CSSDefinition.php index 478d6d4a..7fdbe3a6 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/CSSDefinition.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/CSSDefinition.php @@ -208,8 +208,9 @@ class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition $this->info['border-spacing'] = new 
HTMLPurifier_AttrDef_CSS_Multiple(new HTMLPurifier_AttrDef_CSS_Length(), 2); - // partial support - $this->info['white-space'] = new HTMLPurifier_AttrDef_Enum(array('nowrap')); + // These CSS properties don't work on many browsers, but we live + // in THE FUTURE! + $this->info['white-space'] = new HTMLPurifier_AttrDef_Enum(array('nowrap', 'normal', 'pre', 'pre-wrap', 'pre-line')); if ($config->get('CSS.Proprietary')) { $this->doSetupProprietary($config); @@ -219,6 +220,10 @@ class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition $this->doSetupTricky($config); } + if ($config->get('CSS.Trusted')) { + $this->doSetupTrusted($config); + } + $allow_important = $config->get('CSS.AllowImportant'); // wrap all attr-defs with decorator that handles !important foreach ($this->info as $k => $v) { @@ -245,12 +250,17 @@ class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition // only opacity, for now $this->info['filter'] = new HTMLPurifier_AttrDef_CSS_Filter(); + // more CSS3 + $this->info['page-break-after'] = + $this->info['page-break-before'] = new HTMLPurifier_AttrDef_Enum(array('auto','always','avoid','left','right')); + $this->info['page-break-inside'] = new HTMLPurifier_AttrDef_Enum(array('auto','avoid')); + } protected function doSetupTricky($config) { $this->info['display'] = new HTMLPurifier_AttrDef_Enum(array( 'inline', 'block', 'list-item', 'run-in', 'compact', - 'marker', 'table', 'inline-table', 'table-row-group', + 'marker', 'table', 'inline-block', 'inline-table', 'table-row-group', 'table-header-group', 'table-footer-group', 'table-row', 'table-column-group', 'table-column', 'table-cell', 'table-caption', 'none' )); @@ -260,6 +270,23 @@ class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition $this->info['overflow'] = new HTMLPurifier_AttrDef_Enum(array('visible', 'hidden', 'auto', 'scroll')); } + protected function doSetupTrusted($config) { + $this->info['position'] = new HTMLPurifier_AttrDef_Enum(array( + 'static', 'relative', 
'absolute', 'fixed' + )); + $this->info['top'] = + $this->info['left'] = + $this->info['right'] = + $this->info['bottom'] = new HTMLPurifier_AttrDef_CSS_Composite(array( + new HTMLPurifier_AttrDef_CSS_Length(), + new HTMLPurifier_AttrDef_CSS_Percentage(), + new HTMLPurifier_AttrDef_Enum(array('auto')), + )); + $this->info['z-index'] = new HTMLPurifier_AttrDef_CSS_Composite(array( + new HTMLPurifier_AttrDef_Integer(), + new HTMLPurifier_AttrDef_Enum(array('auto')), + )); + } /** * Performs extra config-based processing. Based off of diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/ChildDef/List.php b/lib/classes/htmlpurifier/library/HTMLPurifier/ChildDef/List.php new file mode 100644 index 00000000..cdaa2893 --- /dev/null +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/ChildDef/List.php @@ -0,0 +1,120 @@ + true, 'ul' => true, 'ol' => true); + public function validateChildren($tokens_of_children, $config, $context) { + // Flag for subclasses + $this->whitespace = false; + + // if there are no tokens, delete parent node + if (empty($tokens_of_children)) return false; + + // the new set of children + $result = array(); + + // current depth into the nest + $nesting = 0; + + // a little sanity check to make sure it's not ALL whitespace + $all_whitespace = true; + + $seen_li = false; + $need_close_li = false; + + foreach ($tokens_of_children as $token) { + if (!empty($token->is_whitespace)) { + $result[] = $token; + continue; + } + $all_whitespace = false; // phew, we're not talking about whitespace + + if ($nesting == 1 && $need_close_li) { + $result[] = new HTMLPurifier_Token_End('li'); + $nesting--; + $need_close_li = false; + } + + $is_child = ($nesting == 0); + + if ($token instanceof HTMLPurifier_Token_Start) { + $nesting++; + } elseif ($token instanceof HTMLPurifier_Token_End) { + $nesting--; + } + + if ($is_child) { + if ($token->name === 'li') { + // good + $seen_li = true; + } elseif ($token->name === 'ul' || $token->name === 'ol') { + // we want 
to tuck this into the previous li + $need_close_li = true; + $nesting++; + if (!$seen_li) { + // create a new li element + $result[] = new HTMLPurifier_Token_Start('li'); + } else { + // backtrack until found + while(true) { + $t = array_pop($result); + if ($t instanceof HTMLPurifier_Token_End) { + // XXX actually, these invariants could very plausibly be violated + // if we are doing silly things with modifying the set of allowed elements. + // FORTUNATELY, it doesn't make a difference, since the allowed + // elements are hard-coded here! + if ($t->name !== 'li') { + trigger_error("Only li present invariant violated in List ChildDef", E_USER_ERROR); + return false; + } + break; + } elseif ($t instanceof HTMLPurifier_Token_Empty) { // bleagh + if ($t->name !== 'li') { + trigger_error("Only li present invariant violated in List ChildDef", E_USER_ERROR); + return false; + } + // XXX this should have a helper for it... + $result[] = new HTMLPurifier_Token_Start('li', $t->attr, $t->line, $t->col, $t->armor); + break; + } else { + if (!$t->is_whitespace) { + trigger_error("Only whitespace present invariant violated in List ChildDef", E_USER_ERROR); + return false; + } + } + } + } + } else { + // start wrapping (this doesn't precisely mimic + // browser behavior, but what browsers do is kind of + // hard to mimic in a standards compliant way + // XXX Actually, this has no impact in practice, + // because this gets handled earlier. 
Arguably, + // we should rip out all of that processing + $result[] = new HTMLPurifier_Token_Start('li'); + $nesting++; + $seen_li = true; + $need_close_li = true; + } + } + $result[] = $token; + } + if ($need_close_li) { + $result[] = new HTMLPurifier_Token_End('li'); + } + if (empty($result)) return false; + if ($all_whitespace) { + return false; + } + if ($tokens_of_children == $result) return true; + return $result; + } +} + +// vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/ChildDef/Table.php b/lib/classes/htmlpurifier/library/HTMLPurifier/ChildDef/Table.php index 7e6e435b..de2a4553 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/ChildDef/Table.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/ChildDef/Table.php @@ -1,7 +1,33 @@ s with a . foreach ($tokens_of_children as $token) { $is_child = ($nesting == 0); @@ -51,8 +79,9 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef // okay, let's stash the tokens away // first token tells us the type of the collection switch ($collection[$tag_index]->name) { - case 'tr': case 'tbody': + $tbody_mode = true; + case 'tr': $content[] = $collection; break; case 'caption': @@ -61,13 +90,28 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef break; case 'thead': case 'tfoot': + $tbody_mode = true; + // XXX This breaks rendering properties with + // Firefox, which never floats a to + // the top. Ever. (Our scheme will float the + // first to the top.) So maybe + // s that are not first should be + // turned into ? Very tricky, indeed. + // access the appropriate variable, $thead or $tfoot $var = $collection[$tag_index]->name; if ($$var === false) { $$var = $collection; } else { - // transmutate the first and less entries into - // tbody tags, and then put into content + // Oops, there's a second one! What + // should we do? Current behavior is to + // transmutate the first and last entries into + // tbody tags, and then put into content. 
+ // Maybe a better idea is to *attach + // it* to the existing thead or tfoot? + // We don't do this, because Firefox + // doesn't float an extra tfoot to the + // bottom like it does for the first one. $collection[$tag_index]->name = 'tbody'; $collection[count($collection)-1]->name = 'tbody'; $content[] = $collection; @@ -126,7 +170,48 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef if ($cols !== false) foreach ($cols as $token_array) $ret = array_merge($ret, $token_array); if ($thead !== false) $ret = array_merge($ret, $thead); if ($tfoot !== false) $ret = array_merge($ret, $tfoot); - foreach ($content as $token_array) $ret = array_merge($ret, $token_array); + + if ($tbody_mode) { + // a little tricky, since the start of the collection may be + // whitespace + $inside_tbody = false; + foreach ($content as $token_array) { + // find the starting token + foreach ($token_array as $t) { + if ($t->name === 'tr' || $t->name === 'tbody') { + break; + } + } // iterator variable carries over + if ($t->name === 'tr') { + if ($inside_tbody) { + $ret = array_merge($ret, $token_array); + } else { + $ret[] = new HTMLPurifier_Token_Start('tbody'); + $ret = array_merge($ret, $token_array); + $inside_tbody = true; + } + } elseif ($t->name === 'tbody') { + if ($inside_tbody) { + $ret[] = new HTMLPurifier_Token_End('tbody'); + $inside_tbody = false; + $ret = array_merge($ret, $token_array); + } else { + $ret = array_merge($ret, $token_array); + } + } else { + trigger_error("tr/tbody in content invariant failed in Table ChildDef", E_USER_ERROR); + } + } + if ($inside_tbody) { + $ret[] = new HTMLPurifier_Token_End('tbody'); + } + } else { + foreach ($content as $token_array) { + // invariant: everything in here is s + $ret = array_merge($ret, $token_array); + } + } + if (!empty($collection) && $is_collecting == false){ // grab the trailing space $ret = array_merge($ret, $collection); diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/Config.php 
b/lib/classes/htmlpurifier/library/HTMLPurifier/Config.php index 3461c9f8..ec98f52d 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/Config.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/Config.php @@ -20,7 +20,7 @@ class HTMLPurifier_Config /** * HTML Purifier's version */ - public $version = '4.2.0'; + public $version = '4.5.0'; /** * Bool indicator whether or not to automatically finalize @@ -44,7 +44,7 @@ class HTMLPurifier_Config /** * Parser for variables */ - protected $parser; + protected $parser = null; /** * Reference HTMLPurifier_ConfigSchema for value checking @@ -76,7 +76,8 @@ class HTMLPurifier_Config /** * Set to false if you do not want line and file numbers in errors - * (useful when unit testing) + * (useful when unit testing). This will also compress some errors + * and exceptions. */ public $chatty = true; @@ -188,7 +189,7 @@ class HTMLPurifier_Config } /** - * Returns a md5 signature of a segment of the configuration object + * Returns a SHA-1 signature of a segment of the configuration object * that uniquely identifies that particular configuration * @note Revision is handled specially and is removed from the batch * before processing! @@ -198,18 +199,18 @@ class HTMLPurifier_Config if (empty($this->serials[$namespace])) { $batch = $this->getBatch($namespace); unset($batch['DefinitionRev']); - $this->serials[$namespace] = md5(serialize($batch)); + $this->serials[$namespace] = sha1(serialize($batch)); } return $this->serials[$namespace]; } /** - * Returns a md5 signature for the entire configuration object + * Returns a SHA-1 signature for the entire configuration object * that uniquely identifies that particular configuration */ public function getSerial() { if (empty($this->serial)) { - $this->serial = md5(serialize($this->getAll())); + $this->serial = sha1(serialize($this->getAll())); } return $this->serial; } @@ -318,26 +319,64 @@ class HTMLPurifier_Config * Retrieves object reference to the HTML definition. 
* @param $raw Return a copy that has not been setup yet. Must be * called before it's been setup, otherwise won't work. + * @param $optimized If true, this method may return null, to + * indicate that a cached version of the modified + * definition object is available and no further edits + * are necessary. Consider using + * maybeGetRawHTMLDefinition, which is more explicitly + * named, instead. */ - public function getHTMLDefinition($raw = false) { - return $this->getDefinition('HTML', $raw); + public function getHTMLDefinition($raw = false, $optimized = false) { + return $this->getDefinition('HTML', $raw, $optimized); } /** * Retrieves object reference to the CSS definition * @param $raw Return a copy that has not been setup yet. Must be * called before it's been setup, otherwise won't work. + * @param $optimized If true, this method may return null, to + * indicate that a cached version of the modified + * definition object is available and no further edits + * are necessary. Consider using + * maybeGetRawCSSDefinition, which is more explicitly + * named, instead. */ - public function getCSSDefinition($raw = false) { - return $this->getDefinition('CSS', $raw); + public function getCSSDefinition($raw = false, $optimized = false) { + return $this->getDefinition('CSS', $raw, $optimized); + } + + /** + * Retrieves object reference to the URI definition + * @param $raw Return a copy that has not been setup yet. Must be + * called before it's been setup, otherwise won't work. + * @param $optimized If true, this method may return null, to + * indicate that a cached version of the modified + * definition object is available and no further edits + * are necessary. Consider using + * maybeGetRawURIDefinition, which is more explicitly + * named, instead. 
+ */ + public function getURIDefinition($raw = false, $optimized = false) { + return $this->getDefinition('URI', $raw, $optimized); } /** * Retrieves a definition * @param $type Type of definition: HTML, CSS, etc * @param $raw Whether or not definition should be returned raw + * @param $optimized Only has an effect when $raw is true. Whether + * or not to return null if the result is already present in + * the cache. This is off by default for backwards + * compatibility reasons, but you need to do things this + * way in order to ensure that caching is done properly. + * Check out enduser-customize.html for more details. + * We probably won't ever change this default, as much as the + * maybe semantics is the "right thing to do." */ - public function getDefinition($type, $raw = false) { + public function getDefinition($type, $raw = false, $optimized = false) { + if ($optimized && !$raw) { + throw new HTMLPurifier_Exception("Cannot set optimized = true when raw = false"); + } if (!$this->finalized) $this->autoFinalize(); // temporarily suspend locks, so we can handle recursive definition calls $lock = $this->lock; @@ -346,52 +385,137 @@ class HTMLPurifier_Config $cache = $factory->create($type, $this); $this->lock = $lock; if (!$raw) { - // see if we can quickly supply a definition + // full definition + // --------------- + // check if definition is in memory if (!empty($this->definitions[$type])) { - if (!$this->definitions[$type]->setup) { - $this->definitions[$type]->setup($this); - $cache->set($this->definitions[$type], $this); + $def = $this->definitions[$type]; + // check if the definition is setup + if ($def->setup) { + return $def; + } else { + $def->setup($this); + if ($def->optimized) $cache->add($def, $this); + return $def; } - return $this->definitions[$type]; } - // memory check missed, try cache - $this->definitions[$type] = $cache->get($this); - if ($this->definitions[$type]) { - // definition in cache, return it - return $this->definitions[$type]; + 
// check if definition is in cache + $def = $cache->get($this); + if ($def) { + // definition in cache, save to memory and return it + $this->definitions[$type] = $def; + return $def; } - } elseif ( - !empty($this->definitions[$type]) && - !$this->definitions[$type]->setup - ) { - // raw requested, raw in memory, quick return - return $this->definitions[$type]; + // initialize it + $def = $this->initDefinition($type); + // set it up + $this->lock = $type; + $def->setup($this); + $this->lock = null; + // save in cache + $cache->add($def, $this); + // return it + return $def; + } else { + // raw definition + // -------------- + // check preconditions + $def = null; + if ($optimized) { + if (is_null($this->get($type . '.DefinitionID'))) { + // fatally error out if definition ID not set + throw new HTMLPurifier_Exception("Cannot retrieve raw version without specifying %$type.DefinitionID"); + } + } + if (!empty($this->definitions[$type])) { + $def = $this->definitions[$type]; + if ($def->setup && !$optimized) { + $extra = $this->chatty ? " (try moving this code block earlier in your initialization)" : ""; + throw new HTMLPurifier_Exception("Cannot retrieve raw definition after it has already been setup" . $extra); + } + if ($def->optimized === null) { + $extra = $this->chatty ? " (try flushing your cache)" : ""; + throw new HTMLPurifier_Exception("Optimization status of definition is unknown" . $extra); + } + if ($def->optimized !== $optimized) { + $msg = $optimized ? "optimized" : "unoptimized"; + $extra = $this->chatty ? " (this backtrace is for the first inconsistent call, which was for a $msg raw definition)" : ""; + throw new HTMLPurifier_Exception("Inconsistent use of optimized and unoptimized raw definition retrievals" . 
$extra); + } + } + // check if definition was in memory + if ($def) { + if ($def->setup) { + // invariant: $optimized === true (checked above) + return null; + } else { + return $def; + } + } + // if optimized, check if definition was in cache + // (because we do the memory check first, this formulation + // is prone to cache slamming, but I think + // guaranteeing that either /all/ of the raw + // setup code or /none/ of it is run is more important.) + if ($optimized) { + // This code path only gets run once; once we put + // something in $definitions (which is guaranteed by the + // trailing code), we always short-circuit above. + $def = $cache->get($this); + if ($def) { + // save the full definition for later, but don't + // return it yet + $this->definitions[$type] = $def; + return null; + } + } + // check invariants for creation + if (!$optimized) { + if (!is_null($this->get($type . '.DefinitionID'))) { + if ($this->chatty) { + $this->triggerError("Due to a documentation error in previous version of HTML Purifier, your definitions are not being cached. If this is OK, you can remove the %$type.DefinitionRev and %$type.DefinitionID declaration. Otherwise, modify your code to use maybeGetRawDefinition, and test if the returned value is null before making any edits (if it is null, that means that a cached version is available, and no raw operations are necessary). 
See Customize for more details", E_USER_WARNING); + } else { + $this->triggerError("Useless DefinitionID declaration", E_USER_WARNING); + } + } + } + // initialize it + $def = $this->initDefinition($type); + $def->optimized = $optimized; + return $def; } + throw new HTMLPurifier_Exception("The impossible happened!"); + } + + private function initDefinition($type) { // quick checks failed, let's create the object if ($type == 'HTML') { - $this->definitions[$type] = new HTMLPurifier_HTMLDefinition(); + $def = new HTMLPurifier_HTMLDefinition(); } elseif ($type == 'CSS') { - $this->definitions[$type] = new HTMLPurifier_CSSDefinition(); + $def = new HTMLPurifier_CSSDefinition(); } elseif ($type == 'URI') { - $this->definitions[$type] = new HTMLPurifier_URIDefinition(); + $def = new HTMLPurifier_URIDefinition(); } else { throw new HTMLPurifier_Exception("Definition of $type type not supported"); } - // quick abort if raw - if ($raw) { - if (is_null($this->get($type . '.DefinitionID'))) { - // fatally error out if definition ID not set - throw new HTMLPurifier_Exception("Cannot retrieve raw version without specifying %$type.DefinitionID"); - } - return $this->definitions[$type]; - } - // set it up - $this->lock = $type; - $this->definitions[$type]->setup($this); - $this->lock = null; - // save in cache - $cache->set($this->definitions[$type], $this); - return $this->definitions[$type]; + $this->definitions[$type] = $def; + return $def; + } + + public function maybeGetRawDefinition($name) { + return $this->getDefinition($name, true, true); + } + + public function maybeGetRawHTMLDefinition() { + return $this->getDefinition('HTML', true, true); + } + + public function maybeGetRawCSSDefinition() { + return $this->getDefinition('CSS', true, true); + } + + public function maybeGetRawURIDefinition() { + return $this->getDefinition('URI', true, true); } /** @@ -544,22 +668,28 @@ class HTMLPurifier_Config */ public function finalize() { $this->finalized = true; - 
unset($this->parser); + $this->parser = null; } /** * Produces a nicely formatted error message by supplying the - * stack frame information from two levels up and OUTSIDE of - * HTMLPurifier_Config. + * stack frame information OUTSIDE of HTMLPurifier_Config. */ protected function triggerError($msg, $no) { // determine previous stack frame - $backtrace = debug_backtrace(); - if ($this->chatty && isset($backtrace[1])) { - $frame = $backtrace[1]; - $extra = " on line {$frame['line']} in file {$frame['file']}"; - } else { - $extra = ''; + $extra = ''; + if ($this->chatty) { + $trace = debug_backtrace(); + // zip(tail(trace), trace) -- but PHP is not Haskell har har + for ($i = 0, $c = count($trace); $i < $c - 1; $i++) { + // XXX this is not correct on some versions of HTML Purifier + if ($trace[$i + 1]['class'] === 'HTMLPurifier_Config') { + continue; + } + $frame = $trace[$i]; + $extra = " invoked on line {$frame['line']} in file {$frame['file']}"; + break; + } } trigger_error($msg . $extra, $no); } diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema.php b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema.php index eb4c8d49..9551ba17 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema.php @@ -60,7 +60,13 @@ class HTMLPurifier_ConfigSchema { * Unserializes the default ConfigSchema. */ public static function makeFromSerial() { - return unserialize(file_get_contents(HTMLPURIFIER_PREFIX . '/HTMLPurifier/ConfigSchema/schema.ser')); + $contents = file_get_contents(HTMLPURIFIER_PREFIX . 
'/HTMLPurifier/ConfigSchema/schema.ser'); + $r = unserialize($contents); + if (!$r) { + $hash = sha1($contents); + trigger_error("Unserialization of configuration schema failed, sha1 of file was $hash", E_USER_ERROR); + } + return $r; } /** diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema.ser b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema.ser index 978089c6291e7d828f6233ac632cedd8a98fdfd7..fa0bacb9476cab9e69889141969c6fefd2b4419c 100644 GIT binary patch delta 1266 zcmZq8UQjZ@Y_hDP{KnAZ%;JV-R!Yvn!FrB4Ir-(ODQ@|BCB;hCev{v+%1t(BX%jIt zgvf^!l@^zzrYKn_Ss6@zs3t%8CX1n}IZ!S!IU`jsIJGD(Yzq4KVI zAZbq*KSzjt@fu>AC$OJqGH}k%%}oWnFwx4us=Bt=$_VIwj}TuUmP5{D+)Gy zayc@I7(WVs0wLjzj~A$py0PUKUWr!HH?9!O2CL1tpnz z=}Oix`#|9VwlBD(C^NYP?sS+lLK2J8Q%jt367#Yrv+?efv9wb1FD)r3Ezxt!tnkds zODzKF25B?2o?K`owfP$FR3=%VKY(Eq8sw=5vKHbjhz~$0XfvVwiWo~Gbw&Ld6yw)%RWWT^c8MWCKO<&8+4>n3!s7 E0s6m3IsgCw diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/CSS.AllowedFonts.txt b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/CSS.AllowedFonts.txt new file mode 100644 index 00000000..3fd46540 --- /dev/null +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/CSS.AllowedFonts.txt @@ -0,0 +1,12 @@ +CSS.AllowedFonts +TYPE: lookup/null +VERSION: 4.3.0 +DEFAULT: NULL +--DESCRIPTION-- +

+ Allows you to manually specify a set of allowed fonts. If + NULL, all fonts are allowed. This directive + affects generic names (serif, sans-serif, monospace, cursive, + fantasy) as well as specific font families. +

+--# vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/CSS.Trusted.txt b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/CSS.Trusted.txt new file mode 100644 index 00000000..e733a61e --- /dev/null +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/CSS.Trusted.txt @@ -0,0 +1,9 @@ +CSS.Trusted +TYPE: bool +VERSION: 4.2.1 +DEFAULT: false +--DESCRIPTION-- +Indicates whether or not the user's CSS input is trusted. If the +input is trusted, a more expansive set of allowed properties is permitted. See +also %HTML.Trusted. +--# vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/Cache.SerializerPermissions.txt b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/Cache.SerializerPermissions.txt new file mode 100644 index 00000000..b2b83d9a --- /dev/null +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/Cache.SerializerPermissions.txt @@ -0,0 +1,11 @@ +Cache.SerializerPermissions +TYPE: int +VERSION: 4.3.0 +DEFAULT: 0755 +--DESCRIPTION-- + +

+ Directory permissions of the files and directories created inside + the DefinitionCache/Serializer or other custom serializer path. +

+--# vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/Core.ColorKeywords.txt b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/Core.ColorKeywords.txt index b8c259d1..f7823982 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/Core.ColorKeywords.txt +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/Core.ColorKeywords.txt @@ -24,5 +24,6 @@ array ( --DESCRIPTION-- Lookup array of color names to six digit hexadecimal number corresponding -to color, with preceding hash mark. Used when parsing colors. +to color, with preceding hash mark. Used when parsing colors. The lookup +is done in a case-insensitive manner. --# vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/Core.DisableExcludes.txt b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/Core.DisableExcludes.txt new file mode 100644 index 00000000..1cd4c2c9 --- /dev/null +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/Core.DisableExcludes.txt @@ -0,0 +1,14 @@ +Core.DisableExcludes +TYPE: bool +DEFAULT: false +VERSION: 4.5.0 +--DESCRIPTION-- +

+ This directive disables SGML-style exclusions, e.g. the exclusion of + <object> in any descendant of a + <pre> tag. Disabling excludes will allow some + invalid documents to pass through HTML Purifier, but HTML Purifier + will also be less likely to accidentally remove large documents during + processing. +

+--# vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/Core.EnableIDNA.txt b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/Core.EnableIDNA.txt new file mode 100644 index 00000000..ce243c35 --- /dev/null +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/Core.EnableIDNA.txt @@ -0,0 +1,9 @@ +Core.EnableIDNA +TYPE: bool +DEFAULT: false +VERSION: 4.4.0 +--DESCRIPTION-- +Allows international domain names in URLs. This configuration option +requires the PEAR Net_IDNA2 module to be installed. It operates by +punycoding any internationalized host names for maximum portability. +--# vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.AllowedComments.txt b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.AllowedComments.txt new file mode 100644 index 00000000..140e2142 --- /dev/null +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.AllowedComments.txt @@ -0,0 +1,10 @@ +HTML.AllowedComments +TYPE: lookup +VERSION: 4.4.0 +DEFAULT: array() +--DESCRIPTION-- +A whitelist which indicates what explicit comment bodies should be +allowed, modulo leading and trailing whitespace. See also %HTML.AllowedCommentsRegexp +(these directives are union'ed together, so a comment is considered +valid if any directive deems it valid.) 
+--# vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.AllowedCommentsRegexp.txt b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.AllowedCommentsRegexp.txt new file mode 100644 index 00000000..f22e977d --- /dev/null +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.AllowedCommentsRegexp.txt @@ -0,0 +1,15 @@ +HTML.AllowedCommentsRegexp +TYPE: string/null +VERSION: 4.4.0 +DEFAULT: NULL +--DESCRIPTION-- +A regexp, which if it matches the body of a comment, indicates that +it should be allowed. Trailing and leading spaces are removed prior +to running this regular expression. +Warning: Make sure you specify +correct anchor metacharacters ^regex$, otherwise you may accept +comments that you did not mean to! In particular, the regex /foo|bar/ +is probably not sufficiently strict, since it also allows foobar. +See also %HTML.AllowedComments (these directives are union'ed together, +so a comment is considered valid if any directive deems it valid.) +--# vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.Nofollow.txt b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.Nofollow.txt new file mode 100644 index 00000000..700b3092 --- /dev/null +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.Nofollow.txt @@ -0,0 +1,7 @@ +HTML.Nofollow +TYPE: bool +VERSION: 4.3.0 +DEFAULT: FALSE +--DESCRIPTION-- +If enabled, nofollow rel attributes are added to all outgoing links. 
+--# vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.SafeIframe.txt b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.SafeIframe.txt new file mode 100644 index 00000000..5eb6ec2b --- /dev/null +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.SafeIframe.txt @@ -0,0 +1,13 @@ +HTML.SafeIframe +TYPE: bool +VERSION: 4.4.0 +DEFAULT: false +--DESCRIPTION-- +

+ Whether or not to permit iframe tags in untrusted documents. This + directive must be accompanied by a whitelist of permitted iframes, + such as %URI.SafeIframeRegexp, otherwise it will fatally error. + This directive has no effect on strict doctypes, as iframes are not + valid. +

+--# vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.SafeScripting.txt b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.SafeScripting.txt new file mode 100644 index 00000000..5ebc7a19 --- /dev/null +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.SafeScripting.txt @@ -0,0 +1,10 @@ +HTML.SafeScripting +TYPE: lookup +VERSION: 4.5.0 +DEFAULT: array() +--DESCRIPTION-- +

+ Whether or not to permit script tags to external scripts in documents. + Inline scripting is not allowed, and the script must match an explicit whitelist. +

+--# vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.TargetBlank.txt b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.TargetBlank.txt new file mode 100644 index 00000000..587a1677 --- /dev/null +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.TargetBlank.txt @@ -0,0 +1,8 @@ +HTML.TargetBlank +TYPE: bool +VERSION: 4.4.0 +DEFAULT: FALSE +--DESCRIPTION-- +If enabled, target=blank attributes are added to all outgoing links. +(This includes links from an HTTPS version of a page to an HTTP version.) +--# vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.Trusted.txt b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.Trusted.txt index f412b256..bc8e6549 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.Trusted.txt +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/HTML.Trusted.txt @@ -5,4 +5,5 @@ DEFAULT: false --DESCRIPTION-- Indicates whether or not the user input is trusted or not. If the input is trusted, a more expansive set of allowed tags and attributes will be used. +See also %CSS.Trusted. --# vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/Output.FixInnerHTML.txt b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/Output.FixInnerHTML.txt new file mode 100644 index 00000000..d6f0d9f2 --- /dev/null +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/Output.FixInnerHTML.txt @@ -0,0 +1,15 @@ +Output.FixInnerHTML +TYPE: bool +VERSION: 4.3.0 +DEFAULT: true +--DESCRIPTION-- +

+ If true, HTML Purifier will protect against Internet Explorer's + mishandling of the innerHTML attribute by appending + a space to any attribute that does not contain angled brackets, spaces + or quotes, but contains a backtick. This slightly changes the + semantics of any given attribute, so if this is unacceptable and + you do not use innerHTML on any of your pages, you can + turn this directive off. +

+--# vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/URI.SafeIframeRegexp.txt b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/URI.SafeIframeRegexp.txt new file mode 100644 index 00000000..79084832 --- /dev/null +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/ConfigSchema/schema/URI.SafeIframeRegexp.txt @@ -0,0 +1,22 @@ +URI.SafeIframeRegexp +TYPE: string/null +VERSION: 4.4.0 +DEFAULT: NULL +--DESCRIPTION-- +

+ A PCRE regular expression that will be matched against an iframe URI. This is + a relatively inflexible scheme, but works well enough for the most common + use-case of iframes: embedded video. This directive only has an effect if + %HTML.SafeIframe is enabled. Here are some example values: +

+<ul>
+    <li><code>%^http://www.youtube.com/embed/%</code> - Allow YouTube videos</li>
+    <li><code>%^http://player.vimeo.com/video/%</code> - Allow Vimeo videos</li>
+    <li><code>%^http://(www.youtube.com/embed/|player.vimeo.com/video/)%</code> - Allow both</li>
+</ul>
+

+ Note that this directive does not give you enough granularity to, say, disable + all autoplay videos. Pipe up on the HTML Purifier forums if this + is a capability you want. +

+--# vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/Definition.php b/lib/classes/htmlpurifier/library/HTMLPurifier/Definition.php index 9cd95650..8a540540 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/Definition.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/Definition.php @@ -12,6 +12,17 @@ abstract class HTMLPurifier_Definition */ public $setup = false; + /** + * If true, write out the final definition object to the cache after + * setup. This will be true only if all invocations to get a raw + * definition object are also optimized. This does not cause file + * system thrashing because on subsequent calls the cached object + * is used and any writes to the raw definition object are short + * circuited. See enduser-customize.html for the high-level + * picture. + */ + public $optimized = null; + /** * What type of definition is it? */ diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/DefinitionCache/Serializer.php b/lib/classes/htmlpurifier/library/HTMLPurifier/DefinitionCache/Serializer.php index 69feec0a..1627f3bf 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/DefinitionCache/Serializer.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/DefinitionCache/Serializer.php @@ -9,14 +9,14 @@ class HTMLPurifier_DefinitionCache_Serializer extends $file = $this->generateFilePath($config); if (file_exists($file)) return false; if (!$this->_prepareDir($config)) return false; - return $this->_write($file, serialize($def)); + return $this->_write($file, serialize($def), $config); } public function set($def, $config) { if (!$this->checkDefType($def)) return; $file = $this->generateFilePath($config); if (!$this->_prepareDir($config)) return false; - return $this->_write($file, serialize($def)); + return $this->_write($file, serialize($def), $config); } public function replace($def, $config) { @@ -24,7 +24,7 @@ class HTMLPurifier_DefinitionCache_Serializer extends $file = 
$this->generateFilePath($config); if (!file_exists($file)) return false; if (!$this->_prepareDir($config)) return false; - return $this->_write($file, serialize($def)); + return $this->_write($file, serialize($def), $config); } public function get($config) { @@ -97,18 +97,34 @@ class HTMLPurifier_DefinitionCache_Serializer extends * Convenience wrapper function for file_put_contents * @param $file File name to write to * @param $data Data to write into file + * @param $config Config object * @return Number of bytes written if success, or false if failure. */ - private function _write($file, $data) { - return file_put_contents($file, $data); + private function _write($file, $data, $config) { + $result = file_put_contents($file, $data); + if ($result !== false) { + // set permissions of the new file (no execute) + $chmod = $config->get('Cache.SerializerPermissions'); + if (!$chmod) { + $chmod = 0644; // invalid config or simpletest + } + $chmod = $chmod & 0666; + chmod($file, $chmod); + } + return $result; } /** * Prepares the directory that this type stores the serials in + * @param $config Config object * @return True if successful */ private function _prepareDir($config) { $directory = $this->generateDirectoryPath($config); + $chmod = $config->get('Cache.SerializerPermissions'); + if (!$chmod) { + $chmod = 0755; // invalid config or simpletest + } if (!is_dir($directory)) { $base = $this->generateBaseDirectoryPath($config); if (!is_dir($base)) { @@ -116,13 +132,13 @@ class HTMLPurifier_DefinitionCache_Serializer extends please create or change using %Cache.SerializerPath', E_USER_WARNING); return false; - } elseif (!$this->_testPermissions($base)) { + } elseif (!$this->_testPermissions($base, $chmod)) { return false; } - $old = umask(0022); // disable group and world writes - mkdir($directory); + $old = umask(0000); + mkdir($directory, $chmod); umask($old); - } elseif (!$this->_testPermissions($directory)) { + } elseif (!$this->_testPermissions($directory, 
$chmod)) { return false; } return true; @@ -131,8 +147,11 @@ class HTMLPurifier_DefinitionCache_Serializer extends /** * Tests permissions on a directory and throws out friendly * error messages and attempts to chmod it itself if possible + * @param $dir Directory path + * @param $chmod Permissions + * @return True if directory writable */ - private function _testPermissions($dir) { + private function _testPermissions($dir, $chmod) { // early abort, if it is writable, everything is hunky-dory if (is_writable($dir)) return true; if (!is_dir($dir)) { @@ -146,17 +165,17 @@ class HTMLPurifier_DefinitionCache_Serializer extends // POSIX system, we can give more specific advice if (fileowner($dir) === posix_getuid()) { // we can chmod it ourselves - chmod($dir, 0755); - return true; + $chmod = $chmod | 0700; + if (chmod($dir, $chmod)) return true; } elseif (filegroup($dir) === posix_getgid()) { - $chmod = '775'; + $chmod = $chmod | 0070; } else { // PHP's probably running as nobody, so we'll // need to give global permissions - $chmod = '777'; + $chmod = $chmod | 0777; } trigger_error('Directory '.$dir.' not writable, '. - 'please chmod to ' . $chmod, + 'please chmod to ' . decoct($chmod), E_USER_WARNING); } else { // generic error message diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/ElementDef.php b/lib/classes/htmlpurifier/library/HTMLPurifier/ElementDef.php index bb160549..b1049252 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/ElementDef.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/ElementDef.php @@ -30,13 +30,25 @@ class HTMLPurifier_ElementDef */ public $attr = array(); + // XXX: Design note: currently, it's not possible to override + // previously defined AttrTransforms without messing around with + // the final generated config. 
This is by design; a previous version + // used an associated list of attr_transform, but it was extremely + // easy to accidentally override other attribute transforms by + // forgetting to specify an index (and just using 0.) While we + // could check this by checking the index number and complaining, + // there is a second problem which is that it is not at all easy to + // tell when something is getting overridden. Combine this with a + // codebase where this isn't really being used, and it's perfect for + // nuking. + /** - * Indexed list of tag's HTMLPurifier_AttrTransform to be done before validation + * List of tags HTMLPurifier_AttrTransform to be done before validation */ public $attr_transform_pre = array(); /** - * Indexed list of tag's HTMLPurifier_AttrTransform to be done after validation + * List of tags HTMLPurifier_AttrTransform to be done after validation */ public $attr_transform_post = array(); @@ -144,9 +156,9 @@ class HTMLPurifier_ElementDef } $this->attr[$k] = $v; } - $this->_mergeAssocArray($this->attr_transform_pre, $def->attr_transform_pre); - $this->_mergeAssocArray($this->attr_transform_post, $def->attr_transform_post); $this->_mergeAssocArray($this->excludes, $def->excludes); + $this->attr_transform_pre = array_merge($this->attr_transform_pre, $def->attr_transform_pre); + $this->attr_transform_post = array_merge($this->attr_transform_post, $def->attr_transform_post); if(!empty($def->content_model)) { $this->content_model = diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/Encoder.php b/lib/classes/htmlpurifier/library/HTMLPurifier/Encoder.php index 2dbb0232..bad9c23a 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/Encoder.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/Encoder.php @@ -19,6 +19,68 @@ class HTMLPurifier_Encoder */ public static function muteErrorHandler() {} + /** + * iconv wrapper which mutes errors, but doesn't work around bugs. 
+ */ + public static function unsafeIconv($in, $out, $text) { + set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler')); + $r = iconv($in, $out, $text); + restore_error_handler(); + return $r; + } + + /** + * iconv wrapper which mutes errors and works around bugs. + */ + public static function iconv($in, $out, $text, $max_chunk_size = 8000) { + $code = self::testIconvTruncateBug(); + if ($code == self::ICONV_OK) { + return self::unsafeIconv($in, $out, $text); + } elseif ($code == self::ICONV_TRUNCATES) { + // we can only work around this if the input character set + // is utf-8 + if ($in == 'utf-8') { + if ($max_chunk_size < 4) { + trigger_error('max_chunk_size is too small', E_USER_WARNING); + return false; + } + // split into 8000 byte chunks, but be careful to handle + // multibyte boundaries properly + if (($c = strlen($text)) <= $max_chunk_size) { + return self::unsafeIconv($in, $out, $text); + } + $r = ''; + $i = 0; + while (true) { + if ($i + $max_chunk_size >= $c) { + $r .= self::unsafeIconv($in, $out, substr($text, $i)); + break; + } + // wibble the boundary + if (0x80 != (0xC0 & ord($text[$i + $max_chunk_size]))) { + $chunk_size = $max_chunk_size; + } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 1]))) { + $chunk_size = $max_chunk_size - 1; + } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 2]))) { + $chunk_size = $max_chunk_size - 2; + } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 3]))) { + $chunk_size = $max_chunk_size - 3; + } else { + return false; // rather confusing UTF-8... 
+ } + $chunk = substr($text, $i, $chunk_size); // substr doesn't mind overlong lengths + $r .= self::unsafeIconv($in, $out, $chunk); + $i += $chunk_size; + } + return $r; + } else { + return false; + } + } else { + return false; + } + } + /** * Cleans a UTF-8 string for well-formedness and SGML validity * @@ -260,6 +322,14 @@ class HTMLPurifier_Encoder return $ret; } + public static function iconvAvailable() { + static $iconv = null; + if ($iconv === null) { + $iconv = function_exists('iconv') && self::testIconvTruncateBug() != self::ICONV_UNUSABLE; + } + return $iconv; + } + /** * Converts a string to UTF-8 based on configuration. */ @@ -267,28 +337,30 @@ class HTMLPurifier_Encoder $encoding = $config->get('Core.Encoding'); if ($encoding === 'utf-8') return $str; static $iconv = null; - if ($iconv === null) $iconv = function_exists('iconv'); - set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler')); + if ($iconv === null) $iconv = self::iconvAvailable(); if ($iconv && !$config->get('Test.ForceNoIconv')) { - $str = iconv($encoding, 'utf-8//IGNORE', $str); + // unaffected by bugs, since UTF-8 support all characters + $str = self::unsafeIconv($encoding, 'utf-8//IGNORE', $str); if ($str === false) { // $encoding is not a valid encoding - restore_error_handler(); trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR); return ''; } // If the string is bjorked by Shift_JIS or a similar encoding // that doesn't support all of ASCII, convert the naughty // characters to their true byte-wise ASCII/UTF-8 equivalents. 
- $str = strtr($str, HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding)); - restore_error_handler(); + $str = strtr($str, self::testEncodingSupportsASCII($encoding)); return $str; } elseif ($encoding === 'iso-8859-1') { $str = utf8_encode($str); - restore_error_handler(); return $str; } - trigger_error('Encoding not supported, please install iconv', E_USER_ERROR); + $bug = HTMLPurifier_Encoder::testIconvTruncateBug(); + if ($bug == self::ICONV_OK) { + trigger_error('Encoding not supported, please install iconv', E_USER_ERROR); + } else { + trigger_error('You have a buggy version of iconv, see https://bugs.php.net/bug.php?id=48147 and http://sourceware.org/bugzilla/show_bug.cgi?id=13541', E_USER_ERROR); + } } /** @@ -298,16 +370,15 @@ class HTMLPurifier_Encoder */ public static function convertFromUTF8($str, $config, $context) { $encoding = $config->get('Core.Encoding'); + if ($escape = $config->get('Core.EscapeNonASCIICharacters')) { + $str = self::convertToASCIIDumbLossless($str); + } if ($encoding === 'utf-8') return $str; static $iconv = null; - if ($iconv === null) $iconv = function_exists('iconv'); - if ($escape = $config->get('Core.EscapeNonASCIICharacters')) { - $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str); - } - set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler')); + if ($iconv === null) $iconv = self::iconvAvailable(); if ($iconv && !$config->get('Test.ForceNoIconv')) { // Undo our previous fix in convertToUTF8, otherwise iconv will barf - $ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding); + $ascii_fix = self::testEncodingSupportsASCII($encoding); if (!$escape && !empty($ascii_fix)) { $clear_fix = array(); foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = ''; @@ -315,15 +386,17 @@ class HTMLPurifier_Encoder } $str = strtr($str, array_flip($ascii_fix)); // Normal stuff - $str = iconv('utf-8', $encoding . 
'//IGNORE', $str); - restore_error_handler(); + $str = self::iconv('utf-8', $encoding . '//IGNORE', $str); return $str; } elseif ($encoding === 'iso-8859-1') { $str = utf8_decode($str); - restore_error_handler(); return $str; } trigger_error('Encoding not supported', E_USER_ERROR); + // You might be tempted to assume that the ASCII representation + // might be OK, however, this is *not* universally true over all + // encodings. So we take the conservative route here, rather + // than forcibly turn on %Core.EscapeNonASCIICharacters } /** @@ -373,6 +446,49 @@ class HTMLPurifier_Encoder return $result; } + /** No bugs detected in iconv. */ + const ICONV_OK = 0; + + /** Iconv truncates output if converting from UTF-8 to another + * character set with //IGNORE, and a non-encodable character is found */ + const ICONV_TRUNCATES = 1; + + /** Iconv does not support //IGNORE, making it unusable for + * transcoding purposes */ + const ICONV_UNUSABLE = 2; + + /** + * glibc iconv has a known bug where it doesn't handle the magic + * //IGNORE stanza correctly. In particular, rather than ignore + * characters, it will return an EILSEQ after consuming some number + * of characters, and expect you to restart iconv as if it were + * an E2BIG. Old versions of PHP did not respect the errno, and + * returned the fragment, so as a result you would see iconv + * mysteriously truncating output. We can work around this by + * manually chopping our input into segments of about 8000 + * characters, as long as PHP ignores the error code. If PHP starts + * paying attention to the error code, iconv becomes unusable. + * + * @returns Error code indicating severity of bug. + */ + public static function testIconvTruncateBug() { + static $code = null; + if ($code === null) { + // better not use iconv, otherwise infinite loop! + $r = self::unsafeIconv('utf-8', 'ascii//IGNORE', "\xCE\xB1" . 
str_repeat('a', 9000)); + if ($r === false) { + $code = self::ICONV_UNUSABLE; + } elseif (($c = strlen($r)) < 9000) { + $code = self::ICONV_TRUNCATES; + } elseif ($c > 9000) { + trigger_error('Your copy of iconv is extremely buggy. Please notify HTML Purifier maintainers: include your iconv version as per phpversion()', E_USER_ERROR); + } else { + $code = self::ICONV_OK; + } + } + return $code; + } + /** * This expensive function tests whether or not a given character * encoding supports ASCII. 7/8-bit encodings like Shift_JIS will @@ -385,6 +501,11 @@ class HTMLPurifier_Encoder * which can be used to "undo" any overzealous iconv action. */ public static function testEncodingSupportsASCII($encoding, $bypass = false) { + // All calls to iconv here are unsafe, proof by case analysis: + // If ICONV_OK, no difference. + // If ICONV_TRUNCATES, all calls involve one character inputs, + // so bug is not triggered. + // If ICONV_UNUSABLE, this call is irrelevant static $encodings = array(); if (!$bypass) { if (isset($encodings[$encoding])) return $encodings[$encoding]; @@ -398,24 +519,22 @@ class HTMLPurifier_Encoder if (strpos($lenc, 'iso-8859-') === 0) return array(); } $ret = array(); - set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler')); - if (iconv('UTF-8', $encoding, 'a') === false) return false; + if (self::unsafeIconv('UTF-8', $encoding, 'a') === false) return false; for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars $c = chr($i); // UTF-8 char - $r = iconv('UTF-8', "$encoding//IGNORE", $c); // initial conversion + $r = self::unsafeIconv('UTF-8', "$encoding//IGNORE", $c); // initial conversion if ( $r === '' || // This line is needed for iconv implementations that do not // omit characters that do not exist in the target character set - ($r === $c && iconv($encoding, 'UTF-8//IGNORE', $r) !== $c) + ($r === $c && self::unsafeIconv($encoding, 'UTF-8//IGNORE', $r) !== $c) ) { // Reverse engineer: what's the UTF-8 equiv of this byte //
sequence? This assumes that there's no variable width // encoding that doesn't support ASCII. - $ret[iconv($encoding, 'UTF-8//IGNORE', $c)] = $c; + $ret[self::unsafeIconv($encoding, 'UTF-8//IGNORE', $c)] = $c; } } - restore_error_handler(); $encodings[$encoding] = $ret; return $ret; } diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/EntityLookup/entities.ser b/lib/classes/htmlpurifier/library/HTMLPurifier/EntityLookup/entities.ser index f2b8b8f2..e8b08128 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/EntityLookup/entities.ser +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/EntityLookup/entities.ser @@ -1 +1 @@ -a:246:{s:4:"nbsp";s:2:" ";s:5:"iexcl";s:2:"¡";s:4:"cent";s:2:"¢";s:5:"pound";s:2:"£";s:6:"curren";s:2:"¤";s:3:"yen";s:2:"¥";s:6:"brvbar";s:2:"¦";s:4:"sect";s:2:"§";s:3:"uml";s:2:"¨";s:4:"copy";s:2:"©";s:4:"ordf";s:2:"ª";s:5:"laquo";s:2:"«";s:3:"not";s:2:"¬";s:3:"shy";s:2:"­";s:3:"reg";s:2:"®";s:4:"macr";s:2:"¯";s:3:"deg";s:2:"°";s:6:"plusmn";s:2:"±";s:5:"acute";s:2:"´";s:5:"micro";s:2:"µ";s:4:"para";s:2:"¶";s:6:"middot";s:2:"·";s:5:"cedil";s:2:"¸";s:4:"ordm";s:2:"º";s:5:"raquo";s:2:"»";s:6:"iquest";s:2:"¿";s:6:"Agrave";s:2:"À";s:6:"Aacute";s:2:"Á";s:5:"Acirc";s:2:"Â";s:6:"Atilde";s:2:"Ã";s:4:"Auml";s:2:"Ä";s:5:"Aring";s:2:"Å";s:5:"AElig";s:2:"Æ";s:6:"Ccedil";s:2:"Ç";s:6:"Egrave";s:2:"È";s:6:"Eacute";s:2:"É";s:5:"Ecirc";s:2:"Ê";s:4:"Euml";s:2:"Ë";s:6:"Igrave";s:2:"Ì";s:6:"Iacute";s:2:"Í";s:5:"Icirc";s:2:"Î";s:4:"Iuml";s:2:"Ï";s:3:"ETH";s:2:"Ð";s:6:"Ntilde";s:2:"Ñ";s:6:"Ograve";s:2:"Ò";s:6:"Oacute";s:2:"Ó";s:5:"Ocirc";s:2:"Ô";s:6:"Otilde";s:2:"Õ";s:4:"Ouml";s:2:"Ö";s:5:"times";s:2:"×";s:6:"Oslash";s:2:"Ø";s:6:"Ugrave";s:2:"Ù";s:6:"Uacute";s:2:"Ú";s:5:"Ucirc";s:2:"Û";s:4:"Uuml";s:2:"Ü";s:6:"Yacute";s:2:"Ý";s:5:"THORN";s:2:"Þ";s:5:"szlig";s:2:"ß";s:6:"agrave";s:2:"à";s:6:"aacute";s:2:"á";s:5:"acirc";s:2:"â";s:6:"atilde";s:2:"ã";s:4:"auml";s:2:"ä";s:5:"aring";s:2:"å";s:5:"aelig";s:2:"æ";s:6:"ccedil";s:2:"ç";s:6:"egrave";s:2:"è";s:6:"eacu
te";s:2:"é";s:5:"ecirc";s:2:"ê";s:4:"euml";s:2:"ë";s:6:"igrave";s:2:"ì";s:6:"iacute";s:2:"í";s:5:"icirc";s:2:"î";s:4:"iuml";s:2:"ï";s:3:"eth";s:2:"ð";s:6:"ntilde";s:2:"ñ";s:6:"ograve";s:2:"ò";s:6:"oacute";s:2:"ó";s:5:"ocirc";s:2:"ô";s:6:"otilde";s:2:"õ";s:4:"ouml";s:2:"ö";s:6:"divide";s:2:"÷";s:6:"oslash";s:2:"ø";s:6:"ugrave";s:2:"ù";s:6:"uacute";s:2:"ú";s:5:"ucirc";s:2:"û";s:4:"uuml";s:2:"ü";s:6:"yacute";s:2:"ý";s:5:"thorn";s:2:"þ";s:4:"yuml";s:2:"ÿ";s:4:"quot";s:1:""";s:3:"amp";s:1:"&";s:2:"lt";s:1:"<";s:2:"gt";s:1:">";s:4:"apos";s:1:"'";s:5:"OElig";s:2:"Œ";s:5:"oelig";s:2:"œ";s:6:"Scaron";s:2:"Š";s:6:"scaron";s:2:"š";s:4:"Yuml";s:2:"Ÿ";s:4:"circ";s:2:"ˆ";s:5:"tilde";s:2:"˜";s:4:"ensp";s:3:" ";s:4:"emsp";s:3:" ";s:6:"thinsp";s:3:" ";s:4:"zwnj";s:3:"‌";s:3:"zwj";s:3:"‍";s:3:"lrm";s:3:"‎";s:3:"rlm";s:3:"‏";s:5:"ndash";s:3:"–";s:5:"mdash";s:3:"—";s:5:"lsquo";s:3:"‘";s:5:"rsquo";s:3:"’";s:5:"sbquo";s:3:"‚";s:5:"ldquo";s:3:"“";s:5:"rdquo";s:3:"”";s:5:"bdquo";s:3:"„";s:6:"dagger";s:3:"†";s:6:"Dagger";s:3:"‡";s:6:"permil";s:3:"‰";s:6:"lsaquo";s:3:"‹";s:6:"rsaquo";s:3:"›";s:4:"euro";s:3:"€";s:4:"fnof";s:2:"ƒ";s:5:"Alpha";s:2:"Α";s:4:"Beta";s:2:"Β";s:5:"Gamma";s:2:"Γ";s:5:"Delta";s:2:"Δ";s:7:"Epsilon";s:2:"Ε";s:4:"Zeta";s:2:"Ζ";s:3:"Eta";s:2:"Η";s:5:"Theta";s:2:"Θ";s:4:"Iota";s:2:"Ι";s:5:"Kappa";s:2:"Κ";s:6:"Lambda";s:2:"Λ";s:2:"Mu";s:2:"Μ";s:2:"Nu";s:2:"Ν";s:2:"Xi";s:2:"Ξ";s:7:"Omicron";s:2:"Ο";s:2:"Pi";s:2:"Π";s:3:"Rho";s:2:"Ρ";s:5:"Sigma";s:2:"Σ";s:3:"Tau";s:2:"Τ";s:7:"Upsilon";s:2:"Υ";s:3:"Phi";s:2:"Φ";s:3:"Chi";s:2:"Χ";s:3:"Psi";s:2:"Ψ";s:5:"Omega";s:2:"Ω";s:5:"alpha";s:2:"α";s:4:"beta";s:2:"β";s:5:"gamma";s:2:"γ";s:5:"delta";s:2:"δ";s:7:"epsilon";s:2:"ε";s:4:"zeta";s:2:"ζ";s:3:"eta";s:2:"η";s:5:"theta";s:2:"θ";s:4:"iota";s:2:"ι";s:5:"kappa";s:2:"κ";s:6:"lambda";s:2:"λ";s:2:"mu";s:2:"μ";s:2:"nu";s:2:"ν";s:2:"xi";s:2:"ξ";s:7:"omicron";s:2:"ο";s:2:"pi";s:2:"π";s:3:"rho";s:2:"ρ";s:6:"sigmaf";s:2:"ς";s:5:"sigma";s:2:"σ";s:3:"tau";s:2:"τ";s:7:"upsilon";s:2:"υ";s:3:"phi";s:
2:"φ";s:3:"chi";s:2:"χ";s:3:"psi";s:2:"ψ";s:5:"omega";s:2:"ω";s:8:"thetasym";s:2:"ϑ";s:5:"upsih";s:2:"ϒ";s:3:"piv";s:2:"ϖ";s:4:"bull";s:3:"•";s:6:"hellip";s:3:"…";s:5:"prime";s:3:"′";s:5:"Prime";s:3:"″";s:5:"oline";s:3:"‾";s:5:"frasl";s:3:"⁄";s:6:"weierp";s:3:"℘";s:5:"image";s:3:"ℑ";s:4:"real";s:3:"ℜ";s:5:"trade";s:3:"™";s:7:"alefsym";s:3:"ℵ";s:4:"larr";s:3:"←";s:4:"uarr";s:3:"↑";s:4:"rarr";s:3:"→";s:4:"darr";s:3:"↓";s:4:"harr";s:3:"↔";s:5:"crarr";s:3:"↵";s:4:"lArr";s:3:"⇐";s:4:"uArr";s:3:"⇑";s:4:"rArr";s:3:"⇒";s:4:"dArr";s:3:"⇓";s:4:"hArr";s:3:"⇔";s:6:"forall";s:3:"∀";s:4:"part";s:3:"∂";s:5:"exist";s:3:"∃";s:5:"empty";s:3:"∅";s:5:"nabla";s:3:"∇";s:4:"isin";s:3:"∈";s:5:"notin";s:3:"∉";s:2:"ni";s:3:"∋";s:4:"prod";s:3:"∏";s:3:"sum";s:3:"∑";s:5:"minus";s:3:"−";s:6:"lowast";s:3:"∗";s:5:"radic";s:3:"√";s:4:"prop";s:3:"∝";s:5:"infin";s:3:"∞";s:3:"ang";s:3:"∠";s:3:"and";s:3:"∧";s:2:"or";s:3:"∨";s:3:"cap";s:3:"∩";s:3:"cup";s:3:"∪";s:3:"int";s:3:"∫";s:3:"sim";s:3:"∼";s:4:"cong";s:3:"≅";s:5:"asymp";s:3:"≈";s:2:"ne";s:3:"≠";s:5:"equiv";s:3:"≡";s:2:"le";s:3:"≤";s:2:"ge";s:3:"≥";s:3:"sub";s:3:"⊂";s:3:"sup";s:3:"⊃";s:4:"nsub";s:3:"⊄";s:4:"sube";s:3:"⊆";s:4:"supe";s:3:"⊇";s:5:"oplus";s:3:"⊕";s:6:"otimes";s:3:"⊗";s:4:"perp";s:3:"⊥";s:4:"sdot";s:3:"⋅";s:5:"lceil";s:3:"⌈";s:5:"rceil";s:3:"⌉";s:6:"lfloor";s:3:"⌊";s:6:"rfloor";s:3:"⌋";s:4:"lang";s:3:"〈";s:4:"rang";s:3:"〉";s:3:"loz";s:3:"◊";s:6:"spades";s:3:"♠";s:5:"clubs";s:3:"♣";s:6:"hearts";s:3:"♥";s:5:"diams";s:3:"♦";} \ No newline at end of file 
+a:253:{s:4:"fnof";s:2:"ƒ";s:5:"Alpha";s:2:"Α";s:4:"Beta";s:2:"Β";s:5:"Gamma";s:2:"Γ";s:5:"Delta";s:2:"Δ";s:7:"Epsilon";s:2:"Ε";s:4:"Zeta";s:2:"Ζ";s:3:"Eta";s:2:"Η";s:5:"Theta";s:2:"Θ";s:4:"Iota";s:2:"Ι";s:5:"Kappa";s:2:"Κ";s:6:"Lambda";s:2:"Λ";s:2:"Mu";s:2:"Μ";s:2:"Nu";s:2:"Ν";s:2:"Xi";s:2:"Ξ";s:7:"Omicron";s:2:"Ο";s:2:"Pi";s:2:"Π";s:3:"Rho";s:2:"Ρ";s:5:"Sigma";s:2:"Σ";s:3:"Tau";s:2:"Τ";s:7:"Upsilon";s:2:"Υ";s:3:"Phi";s:2:"Φ";s:3:"Chi";s:2:"Χ";s:3:"Psi";s:2:"Ψ";s:5:"Omega";s:2:"Ω";s:5:"alpha";s:2:"α";s:4:"beta";s:2:"β";s:5:"gamma";s:2:"γ";s:5:"delta";s:2:"δ";s:7:"epsilon";s:2:"ε";s:4:"zeta";s:2:"ζ";s:3:"eta";s:2:"η";s:5:"theta";s:2:"θ";s:4:"iota";s:2:"ι";s:5:"kappa";s:2:"κ";s:6:"lambda";s:2:"λ";s:2:"mu";s:2:"μ";s:2:"nu";s:2:"ν";s:2:"xi";s:2:"ξ";s:7:"omicron";s:2:"ο";s:2:"pi";s:2:"π";s:3:"rho";s:2:"ρ";s:6:"sigmaf";s:2:"ς";s:5:"sigma";s:2:"σ";s:3:"tau";s:2:"τ";s:7:"upsilon";s:2:"υ";s:3:"phi";s:2:"φ";s:3:"chi";s:2:"χ";s:3:"psi";s:2:"ψ";s:5:"omega";s:2:"ω";s:8:"thetasym";s:2:"ϑ";s:5:"upsih";s:2:"ϒ";s:3:"piv";s:2:"ϖ";s:4:"bull";s:3:"•";s:6:"hellip";s:3:"…";s:5:"prime";s:3:"′";s:5:"Prime";s:3:"″";s:5:"oline";s:3:"‾";s:5:"frasl";s:3:"⁄";s:6:"weierp";s:3:"℘";s:5:"image";s:3:"ℑ";s:4:"real";s:3:"ℜ";s:5:"trade";s:3:"™";s:7:"alefsym";s:3:"ℵ";s:4:"larr";s:3:"←";s:4:"uarr";s:3:"↑";s:4:"rarr";s:3:"→";s:4:"darr";s:3:"↓";s:4:"harr";s:3:"↔";s:5:"crarr";s:3:"↵";s:4:"lArr";s:3:"⇐";s:4:"uArr";s:3:"⇑";s:4:"rArr";s:3:"⇒";s:4:"dArr";s:3:"⇓";s:4:"hArr";s:3:"⇔";s:6:"forall";s:3:"∀";s:4:"part";s:3:"∂";s:5:"exist";s:3:"∃";s:5:"empty";s:3:"∅";s:5:"nabla";s:3:"∇";s:4:"isin";s:3:"∈";s:5:"notin";s:3:"∉";s:2:"ni";s:3:"∋";s:4:"prod";s:3:"∏";s:3:"sum";s:3:"∑";s:5:"minus";s:3:"−";s:6:"lowast";s:3:"∗";s:5:"radic";s:3:"√";s:4:"prop";s:3:"∝";s:5:"infin";s:3:"∞";s:3:"ang";s:3:"∠";s:3:"and";s:3:"∧";s:2:"or";s:3:"∨";s:3:"cap";s:3:"∩";s:3:"cup";s:3:"∪";s:3:"int";s:3:"∫";s:6:"there4";s:3:"∴";s:3:"sim";s:3:"∼";s:4:"cong";s:3:"≅";s:5:"asymp";s:3:"≈";s:2:"ne";s:3:"≠";s:5:"equiv";s:3:"≡";s:2:"le";s:3:"≤";s:2:"ge
";s:3:"≥";s:3:"sub";s:3:"⊂";s:3:"sup";s:3:"⊃";s:4:"nsub";s:3:"⊄";s:4:"sube";s:3:"⊆";s:4:"supe";s:3:"⊇";s:5:"oplus";s:3:"⊕";s:6:"otimes";s:3:"⊗";s:4:"perp";s:3:"⊥";s:4:"sdot";s:3:"⋅";s:5:"lceil";s:3:"⌈";s:5:"rceil";s:3:"⌉";s:6:"lfloor";s:3:"⌊";s:6:"rfloor";s:3:"⌋";s:4:"lang";s:3:"〈";s:4:"rang";s:3:"〉";s:3:"loz";s:3:"◊";s:6:"spades";s:3:"♠";s:5:"clubs";s:3:"♣";s:6:"hearts";s:3:"♥";s:5:"diams";s:3:"♦";s:4:"quot";s:1:""";s:3:"amp";s:1:"&";s:2:"lt";s:1:"<";s:2:"gt";s:1:">";s:4:"apos";s:1:"'";s:5:"OElig";s:2:"Œ";s:5:"oelig";s:2:"œ";s:6:"Scaron";s:2:"Š";s:6:"scaron";s:2:"š";s:4:"Yuml";s:2:"Ÿ";s:4:"circ";s:2:"ˆ";s:5:"tilde";s:2:"˜";s:4:"ensp";s:3:" ";s:4:"emsp";s:3:" ";s:6:"thinsp";s:3:" ";s:4:"zwnj";s:3:"‌";s:3:"zwj";s:3:"‍";s:3:"lrm";s:3:"‎";s:3:"rlm";s:3:"‏";s:5:"ndash";s:3:"–";s:5:"mdash";s:3:"—";s:5:"lsquo";s:3:"‘";s:5:"rsquo";s:3:"’";s:5:"sbquo";s:3:"‚";s:5:"ldquo";s:3:"“";s:5:"rdquo";s:3:"”";s:5:"bdquo";s:3:"„";s:6:"dagger";s:3:"†";s:6:"Dagger";s:3:"‡";s:6:"permil";s:3:"‰";s:6:"lsaquo";s:3:"‹";s:6:"rsaquo";s:3:"›";s:4:"euro";s:3:"€";s:4:"nbsp";s:2:" 
";s:5:"iexcl";s:2:"¡";s:4:"cent";s:2:"¢";s:5:"pound";s:2:"£";s:6:"curren";s:2:"¤";s:3:"yen";s:2:"¥";s:6:"brvbar";s:2:"¦";s:4:"sect";s:2:"§";s:3:"uml";s:2:"¨";s:4:"copy";s:2:"©";s:4:"ordf";s:2:"ª";s:5:"laquo";s:2:"«";s:3:"not";s:2:"¬";s:3:"shy";s:2:"­";s:3:"reg";s:2:"®";s:4:"macr";s:2:"¯";s:3:"deg";s:2:"°";s:6:"plusmn";s:2:"±";s:4:"sup2";s:2:"²";s:4:"sup3";s:2:"³";s:5:"acute";s:2:"´";s:5:"micro";s:2:"µ";s:4:"para";s:2:"¶";s:6:"middot";s:2:"·";s:5:"cedil";s:2:"¸";s:4:"sup1";s:2:"¹";s:4:"ordm";s:2:"º";s:5:"raquo";s:2:"»";s:6:"frac14";s:2:"¼";s:6:"frac12";s:2:"½";s:6:"frac34";s:2:"¾";s:6:"iquest";s:2:"¿";s:6:"Agrave";s:2:"À";s:6:"Aacute";s:2:"Á";s:5:"Acirc";s:2:"Â";s:6:"Atilde";s:2:"Ã";s:4:"Auml";s:2:"Ä";s:5:"Aring";s:2:"Å";s:5:"AElig";s:2:"Æ";s:6:"Ccedil";s:2:"Ç";s:6:"Egrave";s:2:"È";s:6:"Eacute";s:2:"É";s:5:"Ecirc";s:2:"Ê";s:4:"Euml";s:2:"Ë";s:6:"Igrave";s:2:"Ì";s:6:"Iacute";s:2:"Í";s:5:"Icirc";s:2:"Î";s:4:"Iuml";s:2:"Ï";s:3:"ETH";s:2:"Ð";s:6:"Ntilde";s:2:"Ñ";s:6:"Ograve";s:2:"Ò";s:6:"Oacute";s:2:"Ó";s:5:"Ocirc";s:2:"Ô";s:6:"Otilde";s:2:"Õ";s:4:"Ouml";s:2:"Ö";s:5:"times";s:2:"×";s:6:"Oslash";s:2:"Ø";s:6:"Ugrave";s:2:"Ù";s:6:"Uacute";s:2:"Ú";s:5:"Ucirc";s:2:"Û";s:4:"Uuml";s:2:"Ü";s:6:"Yacute";s:2:"Ý";s:5:"THORN";s:2:"Þ";s:5:"szlig";s:2:"ß";s:6:"agrave";s:2:"à";s:6:"aacute";s:2:"á";s:5:"acirc";s:2:"â";s:6:"atilde";s:2:"ã";s:4:"auml";s:2:"ä";s:5:"aring";s:2:"å";s:5:"aelig";s:2:"æ";s:6:"ccedil";s:2:"ç";s:6:"egrave";s:2:"è";s:6:"eacute";s:2:"é";s:5:"ecirc";s:2:"ê";s:4:"euml";s:2:"ë";s:6:"igrave";s:2:"ì";s:6:"iacute";s:2:"í";s:5:"icirc";s:2:"î";s:4:"iuml";s:2:"ï";s:3:"eth";s:2:"ð";s:6:"ntilde";s:2:"ñ";s:6:"ograve";s:2:"ò";s:6:"oacute";s:2:"ó";s:5:"ocirc";s:2:"ô";s:6:"otilde";s:2:"õ";s:4:"ouml";s:2:"ö";s:6:"divide";s:2:"÷";s:6:"oslash";s:2:"ø";s:6:"ugrave";s:2:"ù";s:6:"uacute";s:2:"ú";s:5:"ucirc";s:2:"û";s:4:"uuml";s:2:"ü";s:6:"yacute";s:2:"ý";s:5:"thorn";s:2:"þ";s:4:"yuml";s:2:"ÿ";} \ No newline at end of file diff --git 
a/lib/classes/htmlpurifier/library/HTMLPurifier/Filter/ExtractStyleBlocks.php b/lib/classes/htmlpurifier/library/HTMLPurifier/Filter/ExtractStyleBlocks.php index c64e4612..aeb25df7 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/Filter/ExtractStyleBlocks.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/Filter/ExtractStyleBlocks.php @@ -1,5 +1,11 @@ blocks from input HTML, cleans them up * using CSSTidy, and then places them in $purifier->context->get('StyleBlocks') @@ -21,8 +27,16 @@ class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter private $_styleMatches = array(); private $_tidy; + private $_id_attrdef; + private $_class_attrdef; + private $_enum_attrdef; + public function __construct() { $this->_tidy = new csstidy(); + $this->_tidy->set_cfg('lowercase_s', false); + $this->_id_attrdef = new HTMLPurifier_AttrDef_HTML_ID(true); + $this->_class_attrdef = new HTMLPurifier_AttrDef_CSS_Ident(); + $this->_enum_attrdef = new HTMLPurifier_AttrDef_Enum(array('first-child', 'link', 'visited', 'active', 'hover', 'focus')); } /** @@ -77,27 +91,166 @@ class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter $css = substr($css, 0, -3); } $css = trim($css); + set_error_handler('htmlpurifier_filter_extractstyleblocks_muteerrorhandler'); $this->_tidy->parse($css); + restore_error_handler(); $css_definition = $config->getDefinition('CSS'); + $html_definition = $config->getDefinition('HTML'); + $new_css = array(); foreach ($this->_tidy->css as $k => $decls) { // $decls are all CSS declarations inside an @ selector $new_decls = array(); foreach ($decls as $selector => $style) { $selector = trim($selector); if ($selector === '') continue; // should not happen - if ($selector[0] === '+') { - if ($selector !== '' && $selector[0] === '+') continue; - } - if (!empty($scopes)) { - $new_selector = array(); // because multiple ones are possible - $selectors = array_map('trim', explode(',', $selector)); - foreach ($scopes as $s1) { - 
foreach ($selectors as $s2) { - $new_selector[] = "$s1 $s2"; + // Parse the selector + // Here is the relevant part of the CSS grammar: + // + // ruleset + // : selector [ ',' S* selector ]* '{' ... + // selector + // : simple_selector [ combinator selector | S+ [ combinator? selector ]? ]? + // combinator + // : '+' S* + // : '>' S* + // simple_selector + // : element_name [ HASH | class | attrib | pseudo ]* + // | [ HASH | class | attrib | pseudo ]+ + // element_name + // : IDENT | '*' + // ; + // class + // : '.' IDENT + // ; + // attrib + // : '[' S* IDENT S* [ [ '=' | INCLUDES | DASHMATCH ] S* + // [ IDENT | STRING ] S* ]? ']' + // ; + // pseudo + // : ':' [ IDENT | FUNCTION S* [IDENT S*]? ')' ] + // ; + // + // For reference, here are the relevant tokens: + // + // HASH #{name} + // IDENT {ident} + // INCLUDES == + // DASHMATCH |= + // STRING {string} + // FUNCTION {ident}\( + // + // And the lexical scanner tokens + // + // name {nmchar}+ + // nmchar [_a-z0-9-]|{nonascii}|{escape} + // nonascii [\240-\377] + // escape {unicode}|\\[^\r\n\f0-9a-f] + // unicode \\{h}}{1,6}(\r\n|[ \t\r\n\f])? + // ident -?{nmstart}{nmchar*} + // nmstart [_a-z]|{nonascii}|{escape} + // string {string1}|{string2} + // string1 \"([^\n\r\f\\"]|\\{nl}|{escape})*\" + // string2 \'([^\n\r\f\\"]|\\{nl}|{escape})*\' + // + // We'll implement a subset (in order to reduce attack + // surface); in particular: + // + // - No Unicode support + // - No escapes support + // - No string support (by proxy no attrib support) + // - element_name is matched against allowed + // elements (some people might find this + // annoying...) 
+ // - Pseudo-elements one of :first-child, :link, + // :visited, :active, :hover, :focus + + // handle ruleset + $selectors = array_map('trim', explode(',', $selector)); + $new_selectors = array(); + foreach ($selectors as $sel) { + // split on +, > and spaces + $basic_selectors = preg_split('/\s*([+> ])\s*/', $sel, -1, PREG_SPLIT_DELIM_CAPTURE); + // even indices are chunks, odd indices are + // delimiters + $nsel = null; + $delim = null; // guaranteed to be non-null after + // two loop iterations + for ($i = 0, $c = count($basic_selectors); $i < $c; $i++) { + $x = $basic_selectors[$i]; + if ($i % 2) { + // delimiter + if ($x === ' ') { + $delim = ' '; + } else { + $delim = ' ' . $x . ' '; + } + } else { + // simple selector + $components = preg_split('/([#.:])/', $x, -1, PREG_SPLIT_DELIM_CAPTURE); + $sdelim = null; + $nx = null; + for ($j = 0, $cc = count($components); $j < $cc; $j ++) { + $y = $components[$j]; + if ($j === 0) { + if ($y === '*' || isset($html_definition->info[$y = strtolower($y)])) { + $nx = $y; + } else { + // $nx stays null; this matters + // if we don't manage to find + // any valid selector content, + // in which case we ignore the + // outer $delim + } + } elseif ($j % 2) { + // set delimiter + $sdelim = $y; + } else { + $attrdef = null; + if ($sdelim === '#') { + $attrdef = $this->_id_attrdef; + } elseif ($sdelim === '.') { + $attrdef = $this->_class_attrdef; + } elseif ($sdelim === ':') { + $attrdef = $this->_enum_attrdef; + } else { + throw new HTMLPurifier_Exception('broken invariant sdelim and preg_split'); + } + $r = $attrdef->validate($y, $config, $context); + if ($r !== false) { + if ($r !== true) { + $y = $r; + } + if ($nx === null) { + $nx = ''; + } + $nx .= $sdelim . $y; + } + } + } + if ($nx !== null) { + if ($nsel === null) { + $nsel = $nx; + } else { + $nsel .= $delim . 
$nx; + } + } else { + // delimiters to the left of invalid + // basic selector ignored + } + } + } + if ($nsel !== null) { + if (!empty($scopes)) { + foreach ($scopes as $s) { + $new_selectors[] = "$s $nsel"; + } + } else { + $new_selectors[] = $nsel; } } - $selector = implode(', ', $new_selector); // now it's a string } + if (empty($new_selectors)) continue; + $selector = implode(', ', $new_selectors); foreach ($style as $name => $value) { if (!isset($css_definition->info[$name])) { unset($style[$name]); @@ -110,10 +263,11 @@ class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter } $new_decls[$selector] = $style; } - $this->_tidy->css[$k] = $new_decls; + $new_css[$k] = $new_decls; } // remove stuff that shouldn't be used, could be reenabled // after security risks are analyzed + $this->_tidy->css = $new_css; $this->_tidy->import = array(); $this->_tidy->charset = null; $this->_tidy->namespace = null; diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/Generator.php b/lib/classes/htmlpurifier/library/HTMLPurifier/Generator.php index 27e231b1..3a75b872 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/Generator.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/Generator.php @@ -36,6 +36,11 @@ class HTMLPurifier_Generator */ private $_flashCompat; + /** + * Cache of %Output.FixInnerHTML + */ + private $_innerHTMLFix; + /** * Stack for keeping track of object information when outputting IE * compatibility code. 
@@ -54,6 +59,7 @@ class HTMLPurifier_Generator public function __construct($config, $context) { $this->config = $config; $this->_scriptFix = $config->get('Output.CommentScriptContents'); + $this->_innerHTMLFix = $config->get('Output.FixInnerHTML'); $this->_sortAttr = $config->get('Output.SortAttr'); $this->_flashCompat = $config->get('Output.FlashCompat'); $this->_def = $config->getHTMLDefinition(); @@ -132,19 +138,7 @@ class HTMLPurifier_Generator $_extra = ''; if ($this->_flashCompat) { if ($token->name == "object" && !empty($this->_flashStack)) { - $flash = array_pop($this->_flashStack); - $compat_token = new HTMLPurifier_Token_Empty("embed"); - foreach ($flash->attr as $name => $val) { - if ($name == "classid") continue; - if ($name == "type") continue; - if ($name == "data") $name = "src"; - $compat_token->attr[$name] = $val; - } - foreach ($flash->param as $name => $val) { - if ($name == "movie") $name = "src"; - $compat_token->attr[$name] = $val; - } - $_extra = ""; + // doesn't do anything for now } } return $_extra . 'name . '>'; @@ -202,6 +196,37 @@ class HTMLPurifier_Generator continue; } } + // Workaround for Internet Explorer innerHTML bug. + // Essentially, Internet Explorer, when calculating + // innerHTML, omits quotes if there are no instances of + // angled brackets, quotes or spaces. However, when parsing + // HTML (for example, when you assign to innerHTML), it + // treats backticks as quotes. Thus, + // `` + // becomes + // `` + // becomes + // + // Fortunately, all we need to do is trigger an appropriate + // quoting style, which we do by adding an extra space. + // This also is consistent with the W3C spec, which states + // that user agents may ignore leading or trailing + // whitespace (in fact, most don't, at least for attributes + // like alt, but an extra space at the end is barely + // noticeable). 
Still, we have a configuration knob for + // this, since this transformation is not necessary if you + // don't process user input with innerHTML or you don't plan + // on supporting Internet Explorer. + if ($this->_innerHTMLFix) { + if (strpos($value, '`') !== false) { + // check if correct quoting style would not already be + // triggered + if (strcspn($value, '"\' <>') === strlen($value)) { + // protect! + $value .= ' '; + } + } + } $html .= $key.'="'.$this->escape($value).'" '; } return rtrim($html); diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLDefinition.php b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLDefinition.php index 2454c9c0..b548703a 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLDefinition.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLDefinition.php @@ -147,7 +147,7 @@ class HTMLPurifier_HTMLDefinition extends HTMLPurifier_Definition return $this->_anonModule; } - private $_anonModule; + private $_anonModule = null; // PUBLIC BUT INTERNAL VARIABLES -------------------------------------- diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Bdo.php b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Bdo.php index 6d767ca6..6c5c8aad 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Bdo.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Bdo.php @@ -21,7 +21,7 @@ class HTMLPurifier_HTMLModule_Bdo extends HTMLPurifier_HTMLModule // inclusions wrong for bdo: bdo allows Lang ) ); - $bdo->attr_transform_post['required-dir'] = new HTMLPurifier_AttrTransform_BdoDir(); + $bdo->attr_transform_post[] = new HTMLPurifier_AttrTransform_BdoDir(); $this->attr_collections['I18N']['dir'] = 'Enum#ltr,rtl'; } diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Forms.php b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Forms.php index f3629ab9..89701a99 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Forms.php +++
b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Forms.php @@ -35,7 +35,7 @@ class HTMLPurifier_HTMLModule_Forms extends HTMLPurifier_HTMLModule 'name' => 'CDATA', 'readonly' => 'Bool#readonly', 'size' => 'Number', - 'src' => 'URI#embeds', + 'src' => 'URI#embedded', 'tabindex' => 'Number', 'type' => 'Enum#text,password,checkbox,button,radio,submit,reset,file,hidden,image', 'value' => 'CDATA', @@ -84,7 +84,8 @@ class HTMLPurifier_HTMLModule_Forms extends HTMLPurifier_HTMLModule $button->excludes = $this->makeLookup( 'form', 'fieldset', // Form 'input', 'select', 'textarea', 'label', 'button', // Formctrl - 'a' // as per HTML 4.01 spec, this is omitted by modularization + 'a', // as per HTML 4.01 spec, this is omitted by modularization + 'isindex', 'iframe' // legacy items ); // Extra exclusion: img usemap="" is not permitted within this element. diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Iframe.php b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Iframe.php new file mode 100644 index 00000000..287071ed --- /dev/null +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Iframe.php @@ -0,0 +1,38 @@ +get('HTML.SafeIframe')) { + $this->safe = true; + } + $this->addElement( + 'iframe', 'Inline', 'Flow', 'Common', + array( + 'src' => 'URI#embedded', + 'width' => 'Length', + 'height' => 'Length', + 'name' => 'ID', + 'scrolling' => 'Enum#yes,no,auto', + 'frameborder' => 'Enum#0,1', + 'longdesc' => 'URI', + 'marginheight' => 'Pixels', + 'marginwidth' => 'Pixels', + ) + ); + } + +} + +// vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Legacy.php b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Legacy.php index a7e85945..f466775a 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Legacy.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Legacy.php @@ -89,7 +89,7 @@ class HTMLPurifier_HTMLModule_Legacy extends HTMLPurifier_HTMLModule 
$hr->attr['width'] = 'Length'; $img = $this->addBlankElement('img'); - $img->attr['align'] = 'Enum#top,middle,bottom,left,right'; + $img->attr['align'] = 'IAlign'; $img->attr['border'] = 'Pixels'; $img->attr['hspace'] = 'Pixels'; $img->attr['vspace'] = 'Pixels'; @@ -136,6 +136,22 @@ class HTMLPurifier_HTMLModule_Legacy extends HTMLPurifier_HTMLModule $ul->attr['compact'] = 'Bool#compact'; $ul->attr['type'] = 'Enum#square,disc,circle'; + // "safe" modifications to "unsafe" elements + // WARNING: If you want to add support for an unsafe, legacy + // attribute, make a new TrustedLegacy module with the trusted + // bit set appropriately + + $form = $this->addBlankElement('form'); + $form->content_model = 'Flow | #PCDATA'; + $form->content_model_type = 'optional'; + $form->attr['target'] = 'FrameTarget'; + + $input = $this->addBlankElement('input'); + $input->attr['align'] = 'IAlign'; + + $legend = $this->addBlankElement('legend'); + $legend->attr['align'] = 'LAlign'; + } } diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/List.php b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/List.php index 57aad7b3..380b635d 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/List.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/List.php @@ -20,10 +20,16 @@ class HTMLPurifier_HTMLModule_List extends HTMLPurifier_HTMLModule public $content_sets = array('Flow' => 'List'); public function setup($config) { - $ol = $this->addElement('ol', 'List', 'Required: li', 'Common'); - $ol->wrap = "li"; - $ul = $this->addElement('ul', 'List', 'Required: li', 'Common'); - $ul->wrap = "li"; + $ol = $this->addElement('ol', 'List', new HTMLPurifier_ChildDef_List(), 'Common'); + $ul = $this->addElement('ul', 'List', new HTMLPurifier_ChildDef_List(), 'Common'); + // XXX The wrap attribute is handled by MakeWellFormed. 
This is all + // quite unsatisfactory, because we generated this + // *specifically* for lists, and now a big chunk of the handling + // is done properly by the List ChildDef. So actually, we just + // want enough information to make autoclosing work properly, + // and then hand off the tricky stuff to the ChildDef. + $ol->wrap = 'li'; + $ul->wrap = 'li'; $this->addElement('dl', 'List', 'Required: dt | dd', 'Common'); $this->addElement('li', false, 'Flow', 'Common'); diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Name.php b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Name.php index 31300f36..bf797faa 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Name.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Name.php @@ -11,7 +11,7 @@ class HTMLPurifier_HTMLModule_Name extends HTMLPurifier_HTMLModule $element = $this->addBlankElement($name); $element->attr['name'] = 'CDATA'; if (!$config->get('HTML.Attr.Name.UseCDATA')) { - $element->attr_transform_post['NameSync'] = new HTMLPurifier_AttrTransform_NameSync(); + $element->attr_transform_post[] = new HTMLPurifier_AttrTransform_NameSync(); } } } diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Nofollow.php b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Nofollow.php new file mode 100644 index 00000000..3aa6654a --- /dev/null +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Nofollow.php @@ -0,0 +1,19 @@ +addBlankElement('a'); + $a->attr_transform_post[] = new HTMLPurifier_AttrTransform_Nofollow(); + } + +} + +// vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/SafeEmbed.php b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/SafeEmbed.php index 1a4b3b03..b054c0c0 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/SafeEmbed.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/SafeEmbed.php @@ -21,7 +21,7 @@ class 
HTMLPurifier_HTMLModule_SafeEmbed extends HTMLPurifier_HTMLModule 'allowscriptaccess' => 'Enum#never', 'allownetworking' => 'Enum#internal', 'flashvars' => 'Text', - 'wmode' => 'Enum#window', + 'wmode' => 'Enum#window,transparent,opaque', 'name' => 'ID', ) ); diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/SafeObject.php b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/SafeObject.php index f190f087..acbfa7f7 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/SafeObject.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/SafeObject.php @@ -29,7 +29,6 @@ class HTMLPurifier_HTMLModule_SafeObject extends HTMLPurifier_HTMLModule 'width' => 'Pixels#' . $max, 'height' => 'Pixels#' . $max, 'data' => 'URI#embedded', - 'classid' => 'Enum#clsid:d27cdb6e-ae6d-11cf-96b8-444553540000', 'codebase' => new HTMLPurifier_AttrDef_Enum(array( 'http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=6,0,40,0')), ) diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/SafeScripting.php b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/SafeScripting.php new file mode 100644 index 00000000..e32a6b6c --- /dev/null +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/SafeScripting.php @@ -0,0 +1,37 @@ +get('HTML.SafeScripting'); + $script = $this->addElement( + 'script', + 'Inline', + 'Empty', + null, + array( + // While technically not required by the spec, we're forcing + // it to this value. 
+ 'type' => 'Enum#text/javascript', + 'src*' => new HTMLPurifier_AttrDef_Enum(array_keys($allowed)) + ) + ); + $script->attr_transform_pre[] = + $script->attr_transform_post[] = new HTMLPurifier_AttrTransform_ScriptRequired(); + + } + +} + +// vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Scripting.php b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Scripting.php index 9c95d467..42e8f9e5 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Scripting.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Scripting.php @@ -45,8 +45,8 @@ class HTMLPurifier_HTMLModule_Scripting extends HTMLPurifier_HTMLModule ); $this->info['script']->content_model = '#PCDATA'; $this->info['script']->content_model_type = 'optional'; - $this->info['script']->attr_transform_pre['type'] = - $this->info['script']->attr_transform_post['type'] = + $this->info['script']->attr_transform_pre[] = + $this->info['script']->attr_transform_post[] = new HTMLPurifier_AttrTransform_ScriptRequired(); } } diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Tables.php b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Tables.php index 951b8224..50e5c93c 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Tables.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/Tables.php @@ -37,6 +37,9 @@ class HTMLPurifier_HTMLModule_Tables extends HTMLPurifier_HTMLModule 'abbr' => 'Text', 'colspan' => 'Number', 'rowspan' => 'Number', + // Apparently, as of HTML5 this attribute only applies + // to 'th' elements. 
+ 'scope' => 'Enum#row,col,rowgroup,colgroup', ), $cell_align ); diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/TargetBlank.php b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/TargetBlank.php new file mode 100644 index 00000000..e1305ec5 --- /dev/null +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModule/TargetBlank.php @@ -0,0 +1,19 @@ +addBlankElement('a'); + $a->attr_transform_post[] = new HTMLPurifier_AttrTransform_TargetBlank(); + } + +} + +// vim: et sw=4 sts=4 diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModuleManager.php b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModuleManager.php index 43af050f..73efe007 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModuleManager.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/HTMLModuleManager.php @@ -65,11 +65,11 @@ class HTMLPurifier_HTMLModuleManager 'Presentation', 'Edit', 'Bdo', 'Tables', 'Image', 'StyleAttribute', // Unsafe: - 'Scripting', 'Object', 'Forms', + 'Scripting', 'Object', 'Forms', // Sorta legacy, but present in strict: 'Name', ); - $transitional = array('Legacy', 'Target'); + $transitional = array('Legacy', 'Target', 'Iframe'); $xml = array('XMLCommonAttributes'); $non_xml = array('NonXMLCommonAttributes'); @@ -112,7 +112,9 @@ class HTMLPurifier_HTMLModuleManager $this->doctypes->register( 'XHTML 1.1', true, - array_merge($common, $xml, array('Ruby')), + // Iframe is a real XHTML 1.1 module, despite being + // "transitional"! + array_merge($common, $xml, array('Ruby', 'Iframe')), array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Strict', 'Tidy_Name'), // Tidy_XHTML1_1 array(), '-//W3C//DTD XHTML 1.1//EN', @@ -216,19 +218,25 @@ class HTMLPurifier_HTMLModuleManager } } - // add proprietary module (this gets special treatment because - // it is completely removed from doctypes, etc.) 
+ // custom modules if ($config->get('HTML.Proprietary')) { $modules[] = 'Proprietary'; } - - // add SafeObject/Safeembed modules if ($config->get('HTML.SafeObject')) { $modules[] = 'SafeObject'; } if ($config->get('HTML.SafeEmbed')) { $modules[] = 'SafeEmbed'; } + if ($config->get('HTML.SafeScripting') !== array()) { + $modules[] = 'SafeScripting'; + } + if ($config->get('HTML.Nofollow')) { + $modules[] = 'Nofollow'; + } + if ($config->get('HTML.TargetBlank')) { + $modules[] = 'TargetBlank'; + } // merge in custom modules $modules = array_merge($modules, $this->userModules); @@ -364,6 +372,13 @@ class HTMLPurifier_HTMLModuleManager // :TODO: // non-standalone definitions that don't have a standalone // to merge into could be deferred to the end + // HOWEVER, it is perfectly valid for a non-standalone + // definition to lack a standalone definition, even + // after all processing: this allows us to safely + // specify extra attributes for elements that may not be + // enabled all in one place. In particular, this might + // be the case for trusted elements. WARNING: care must + // be taken that the /extra/ definitions are all safe. 
continue; } diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/Injector/RemoveEmpty.php b/lib/classes/htmlpurifier/library/HTMLPurifier/Injector/RemoveEmpty.php index f1d3f4fa..48f6b9fe 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/Injector/RemoveEmpty.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/Injector/RemoveEmpty.php @@ -5,6 +5,9 @@ class HTMLPurifier_Injector_RemoveEmpty extends HTMLPurifier_Injector private $context, $config, $attrValidator, $removeNbsp, $removeNbspExceptions; + // TODO: make me configurable + private $_exclude = array('colgroup' => 1, 'th' => 1, 'td' => 1, 'iframe' => 1); + public function prepare($config, $context) { parent::prepare($config, $context); $this->config = $config; @@ -30,7 +33,7 @@ class HTMLPurifier_Injector_RemoveEmpty extends HTMLPurifier_Injector break; } if (!$next || ($next instanceof HTMLPurifier_Token_End && $next->name == $token->name)) { - if ($token->name == 'colgroup') return; + if (isset($this->_exclude[$token->name])) return; $this->attrValidator->validateToken($token, $this->config, $this->context); $token->armor['ValidateAttributes'] = true; if (isset($token->attr['id']) || isset($token->attr['name'])) return; diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/Lexer.php b/lib/classes/htmlpurifier/library/HTMLPurifier/Lexer.php index 61e065f3..4607cf09 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/Lexer.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/Lexer.php @@ -235,7 +235,7 @@ class HTMLPurifier_Lexer */ protected static function removeIEConditional($string) { return preg_replace( - '##si', // probably should generalize for all strings + '##si', // probably should generalize for all strings '', $string ); @@ -273,11 +273,11 @@ class HTMLPurifier_Lexer $html = $this->escapeCommentedCDATA($html); } - $html = $this->removeIEConditional($html); - // escape CDATA $html = $this->escapeCDATA($html); + $html = $this->removeIEConditional($html); + // extract 
body from document if applicable if ($config->get('Core.ConvertDocumentToFragment')) { $e = false; diff --git a/lib/classes/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php b/lib/classes/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php index 8a2442ef..0913297d 100644 --- a/lib/classes/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php +++ b/lib/classes/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php @@ -72,23 +72,57 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer } /** - * Recursive function that tokenizes a node, putting it into an accumulator. - * + * Iterative function that tokenizes a node, putting it into an accumulator. + * To iterate is human, to recurse divine - L. Peter Deutsch * @param $node DOMNode to be tokenized. * @param $tokens Array-list of already tokenized tokens. - * @param $collect Says whether or start and close are collected, set to - * false at first recursion because it's the implicit DIV - * tag you're dealing with. * @returns Tokens of node appended to previously passed tokens. */ - protected function tokenizeDOM($node, &$tokens, $collect = false) { + protected function tokenizeDOM($node, &$tokens) { + $level = 0; + $nodes = array($level => array($node)); + $closingNodes = array(); + do { + while (!empty($nodes[$level])) { + $node = array_shift($nodes[$level]); // FIFO + $collect = $level > 0 ? true : false; + $needEndingTag = $this->createStartNode($node, $tokens, $collect); + if ($needEndingTag) { + $closingNodes[$level][] = $node; + } + if ($node->childNodes && $node->childNodes->length) { + $level++; + $nodes[$level] = array(); + foreach ($node->childNodes as $childNode) { + array_push($nodes[$level], $childNode); + } + } + } + $level--; + if ($level && isset($closingNodes[$level])) { + while($node = array_pop($closingNodes[$level])) { + $this->createEndNode($node, $tokens); + } + } + } while ($level > 0); + } + + /** + * @param $node DOMNode to be tokenized. 
+ * @param $tokens Array-list of already tokenized tokens. + * @param $collect Says whether or not start and close are collected, set to + * false at first recursion because it's the implicit DIV + * tag you're dealing with. + * @returns bool if the token needs an endtoken + */ + protected function createStartNode($node, &$tokens, $collect) { // intercept non element nodes. WE MUST catch all of them, // but we're not getting the character reference nodes because // those should have been preprocessed if ($node->nodeType === XML_TEXT_NODE) { $tokens[] = $this->factory->createText($node->data); - return; + return false; } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) { // undo libxml's special treatment of