Update HTMLPurifier to current stable version 4.5.0
This commit is contained in:
@@ -32,7 +32,7 @@ class HTMLPurifier_AttrDef_CSS_Background extends HTMLPurifier_AttrDef
|
||||
$string = $this->mungeRgb($string);
|
||||
|
||||
// assumes URI doesn't have spaces in it
|
||||
$bits = explode(' ', strtolower($string)); // bits to process
|
||||
$bits = explode(' ', $string); // bits to process
|
||||
|
||||
$caught = array();
|
||||
$caught['color'] = false;
|
||||
|
||||
@@ -2,11 +2,43 @@
|
||||
|
||||
/**
|
||||
* Validates a font family list according to CSS spec
|
||||
* @todo whitelisting allowed fonts would be nice
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
protected $mask = null;
|
||||
|
||||
/**
 * Builds the mask of bytes that are permitted inside a font name.
 *
 * The mask holds '_', '-', space, the ASCII letters a-z and A-Z, the
 * digits 0-9, and every byte from 0x80 through 0xFF. validate() feeds it
 * to strspn() to check that a font name consists solely of these bytes.
 */
public function __construct() {
    // range() is used instead of `for ($c = 'a'; $c <= 'z'; $c++)`:
    // incrementing the string 'z' yields 'aa', and 'aa' <= 'z' is still
    // true, so that loop keeps running up to 'yz' and stuffs the mask
    // with well over a thousand redundant letters. The set of accepted
    // characters is the same either way; only the mask length differs.
    $this->mask = '_- '
        . implode('', range('a', 'z'))
        . implode('', range('A', 'Z'))
        . implode('', range('0', '9'));

    // special bytes used by UTF-8
    for ($i = 0x80; $i <= 0xFF; $i++) {
        // We don't bother excluding invalid bytes in this range,
        // because our restriction to well-formed UTF-8 will
        // prevent these from ever occurring.
        $this->mask .= chr($i);
    }

    /*
        PHP's internal strcspn implementation is
        O(length of string * length of mask), making it inefficient
        for large masks. However, it's still faster than
        preg_match.

          for (p = s1;;) {
            spanp = s2;
            do {
              if (*spanp == c || p == s1_end) {
                return p - s1;
              }
            } while (spanp++ < (s2_end - 1));
            c = *++p;
          }
    */
    // possible optimization: invert the mask.
}
|
||||
|
||||
public function validate($string, $config, $context) {
|
||||
static $generic_names = array(
|
||||
'serif' => true,
|
||||
@@ -15,6 +47,7 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
|
||||
'fantasy' => true,
|
||||
'cursive' => true
|
||||
);
|
||||
$allowed_fonts = $config->get('CSS.AllowedFonts');
|
||||
|
||||
// assume that no font names contain commas in them
|
||||
$fonts = explode(',', $string);
|
||||
@@ -24,7 +57,9 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
|
||||
if ($font === '') continue;
|
||||
// match a generic name
|
||||
if (isset($generic_names[$font])) {
|
||||
$final .= $font . ', ';
|
||||
if ($allowed_fonts === null || isset($allowed_fonts[$font])) {
|
||||
$final .= $font . ', ';
|
||||
}
|
||||
continue;
|
||||
}
|
||||
// match a quoted name
|
||||
@@ -40,6 +75,10 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
|
||||
|
||||
// $font is a pure representation of the font name
|
||||
|
||||
if ($allowed_fonts !== null && !isset($allowed_fonts[$font])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ctype_alnum($font) && $font !== '') {
|
||||
// very simple font, allow it in unharmed
|
||||
$final .= $font . ', ';
|
||||
@@ -50,17 +89,103 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
|
||||
// shouldn't show up regardless
|
||||
$font = str_replace(array("\n", "\t", "\r", "\x0C"), ' ', $font);
|
||||
|
||||
// These ugly transforms don't pose a security
|
||||
// risk (as \\ and \" might). We could try to be clever and
|
||||
// use single-quote wrapping when there is a double quote
|
||||
// present, but I have chosen not to implement that.
|
||||
// (warning: this code relies on the selection of quotation
|
||||
// mark below)
|
||||
$font = str_replace('\\', '\\5C ', $font);
|
||||
$font = str_replace('"', '\\22 ', $font);
|
||||
// Here, there are various classes of characters which need
|
||||
// to be treated differently:
|
||||
// - Alphanumeric characters are essentially safe. We
|
||||
// handled these above.
|
||||
// - Spaces require quoting, though most parsers will do
|
||||
// the right thing if there aren't any characters that
|
||||
// can be misinterpreted
|
||||
// - Dashes rarely occur, but they are fairly unproblematic
|
||||
// for parsing/rendering purposes.
|
||||
// The above characters cover the majority of Western font
|
||||
// names.
|
||||
// - Arbitrary Unicode characters not in ASCII. Because
|
||||
// most parsers give little thought to Unicode, treatment
|
||||
// of these codepoints is basically uniform, even for
|
||||
// punctuation-like codepoints. These characters can
|
||||
// show up in non-Western pages and are supported by most
|
||||
// major browsers, for example: "MS 明朝" is a
|
||||
// legitimate font-name
|
||||
// <http://ja.wikipedia.org/wiki/MS_明朝>. See
|
||||
// the CSS3 spec for more examples:
|
||||
// <http://www.w3.org/TR/2011/WD-css3-fonts-20110324/localizedfamilynames.png>
|
||||
// You can see live samples of these on the Internet:
|
||||
// <http://www.google.co.jp/search?q=font-family+MS+明朝|ゴシック>
|
||||
// However, most of these fonts have ASCII equivalents:
|
||||
// for example, 'MS Mincho', and it's considered
|
||||
// professional to use ASCII font names instead of
|
||||
// Unicode font names. Thanks Takeshi Terada for
|
||||
// providing this information.
|
||||
// The following characters, to my knowledge, have not been
|
||||
// used to name font names.
|
||||
// - Single quote. While theoretically you might find a
|
||||
// font name that has a single quote in its name (serving
|
||||
// as an apostrophe, e.g. Dave's Scribble), I haven't
|
||||
// been able to find any actual examples of this.
|
||||
// Internet Explorer's cssText translation (which I
|
||||
// believe is invoked by innerHTML) normalizes any
|
||||
// quoting to single quotes, and fails to escape single
|
||||
// quotes. (Note that this is not IE's behavior for all
|
||||
// CSS properties, just some sort of special casing for
|
||||
// font-family). So a single quote *cannot* be used
|
||||
// safely in the font-family context if there will be an
|
||||
// innerHTML/cssText translation. Note that Firefox 3.x
|
||||
// does this too.
|
||||
// - Double quote. In IE, these get normalized to
|
||||
// single-quotes, no matter what the encoding. (Fun
|
||||
// fact, in IE8, the 'content' CSS property gained
|
||||
// support, where they special cased to preserve encoded
|
||||
// double quotes, but still translate unadorned double
|
||||
// quotes into single quotes.) So, because their
|
||||
// fixpoint behavior is identical to single quotes, they
|
||||
// cannot be allowed either. Firefox 3.x displays
|
||||
// single-quote style behavior.
|
||||
// - Backslashes are reduced by one (so \\ -> \) every
|
||||
// iteration, so they cannot be used safely. This shows
|
||||
// up in IE7, IE8 and FF3
|
||||
// - Semicolons, commas and backticks are handled properly.
|
||||
// - The rest of the ASCII punctuation is handled properly.
|
||||
// We haven't checked what browsers do to unadorned
|
||||
// versions, but this is not important as long as the
|
||||
// browser doesn't /remove/ surrounding quotes (as IE does
|
||||
// for HTML).
|
||||
//
|
||||
// With these results in hand, we conclude that there are
|
||||
// various levels of safety:
|
||||
// - Paranoid: alphanumeric, spaces and dashes(?)
|
||||
// - International: Paranoid + non-ASCII Unicode
|
||||
// - Edgy: Everything except quotes, backslashes
|
||||
// - NoJS: Standards compliance, e.g. sod IE. Note that
|
||||
// with some judicious character escaping (since certain
|
||||
// types of escaping don't work) this is theoretically
|
||||
// OK as long as innerHTML/cssText is not called.
|
||||
// We believe that international is a reasonable default
|
||||
// (that we will implement now), and once we do more
|
||||
// extensive research, we may feel comfortable with dropping
|
||||
// it down to edgy.
|
||||
|
||||
// complicated font, requires quoting
|
||||
$final .= "\"$font\", "; // note that this will later get turned into "
|
||||
// Edgy: alphanumeric, spaces, dashes, underscores and Unicode. Use of
|
||||
// str(c)spn assumes that the string was already well formed
|
||||
// Unicode (which of course it is).
|
||||
if (strspn($font, $this->mask) !== strlen($font)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Historical:
|
||||
// In the absence of innerHTML/cssText, these ugly
|
||||
// transforms don't pose a security risk (as \\ and \"
|
||||
// might--these escapes are not supported by most browsers).
|
||||
// We could try to be clever and use single-quote wrapping
|
||||
// when there is a double quote present, but I have chosen
|
||||
// not to implement that. (NOTE: you can reduce the amount
|
||||
// of escapes by one depending on what quoting style you use)
|
||||
// $font = str_replace('\\', '\\5C ', $font);
|
||||
// $font = str_replace('"', '\\22 ', $font);
|
||||
// $font = str_replace("'", '\\27 ', $font);
|
||||
|
||||
// font possibly with spaces, requires quoting
|
||||
$final .= "'$font', ";
|
||||
}
|
||||
$final = rtrim($final, ', ');
|
||||
if ($final === '') return false;
|
||||
|
||||
@@ -0,0 +1,24 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Validates based on {ident} CSS grammar production
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_CSS_Ident extends HTMLPurifier_AttrDef
{

    /**
     * Checks that a value is a single CSS identifier.
     *
     * Surrounding whitespace is trimmed first. The trimmed value is
     * accepted when it is an optional leading '-', then an ASCII letter
     * or underscore, then any run of ASCII letters, digits, underscores
     * and dashes.
     *
     * @param string $string Candidate identifier.
     * @param mixed $config Not consulted by this validator.
     * @param mixed $context Not consulted by this validator.
     * @return string|bool The trimmed identifier, or false when invalid.
     */
    public function validate($string, $config, $context)
    {
        $ident = trim($string);

        // '' and '0' are the only strings PHP treats as false; neither is
        // a valid identifier, so bail out before running the regex.
        if (!$ident) {
            return false;
        }

        if (preg_match('/^(-?[A-Za-z_][A-Za-z_\-0-9]*)$/', $ident)) {
            return $ident;
        }

        return false;
    }

}
|
||||
|
||||
// vim: et sw=4 sts=4
|
||||
@@ -43,6 +43,15 @@ class HTMLPurifier_AttrDef_CSS_URI extends HTMLPurifier_AttrDef_URI
|
||||
// extra sanity check; should have been done by URI
|
||||
$result = str_replace(array('"', "\\", "\n", "\x0c", "\r"), "", $result);
|
||||
|
||||
// suspicious characters are ()'; we're going to percent encode
|
||||
// them for safety.
|
||||
$result = str_replace(array('(', ')', "'"), array('%28', '%29', '%27'), $result);
|
||||
|
||||
// there's an extra bug where ampersands lose their escaping on
|
||||
// an innerHTML cycle, so a very unlucky query parameter could
|
||||
// then change the meaning of the URL. Unfortunately, there's
|
||||
// not much we can do about that...
|
||||
|
||||
return "url(\"$result\")";
|
||||
|
||||
}
|
||||
|
||||
@@ -0,0 +1,28 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Dummy AttrDef that mimics another AttrDef, BUT it generates clones
|
||||
* with make.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Clone extends HTMLPurifier_AttrDef
{
    /**
     * The AttrDef instance that this object stands in for.
     */
    protected $clone;

    /**
     * @param mixed $clone Object to wrap; it must expose validate(), since
     *                     validate() below forwards to it.
     */
    public function __construct($clone)
    {
        $this->clone = $clone;
    }

    /**
     * Forwards validation, unchanged, to the wrapped object.
     *
     * @return mixed Whatever the wrapped object's validate() returns.
     */
    public function validate($v, $config, $context)
    {
        $wrapped = $this->clone;
        return $wrapped->validate($v, $config, $context);
    }

    /**
     * Produces a fresh copy of the wrapped object via PHP's clone operator.
     *
     * @param string $string Ignored; present only to match the make() signature.
     * @return mixed A clone of the wrapped object.
     */
    public function make($string)
    {
        $copy = clone $this->clone;
        return $copy;
    }

}
|
||||
|
||||
// vim: et sw=4 sts=4
|
||||
@@ -14,7 +14,8 @@ class HTMLPurifier_AttrDef_HTML_Color extends HTMLPurifier_AttrDef
|
||||
$string = trim($string);
|
||||
|
||||
if (empty($string)) return false;
|
||||
if (isset($colors[$string])) return $colors[$string];
|
||||
$lower = strtolower($string);
|
||||
if (isset($colors[$lower])) return $colors[$lower];
|
||||
if ($string[0] === '#') $hex = substr($string, 1);
|
||||
else $hex = $string;
|
||||
|
||||
|
||||
@@ -12,12 +12,22 @@
|
||||
class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
// ref functionality disabled, since we also have to verify
|
||||
// whether or not the ID it refers to exists
|
||||
// selector is NOT a valid thing to use for IDREFs, because IDREFs
|
||||
// *must* target IDs that exist, whereas selector #ids do not.
|
||||
|
||||
/**
|
||||
* Determines whether or not we're validating an ID in a CSS
|
||||
* selector context.
|
||||
*/
|
||||
protected $selector;
|
||||
|
||||
/**
 * @param bool $selector Whether IDs are being validated in a CSS selector
 *                       context; stored for validate() to consult.
 */
public function __construct($selector = false)
{
    $this->selector = $selector;
}
|
||||
|
||||
public function validate($id, $config, $context) {
|
||||
|
||||
if (!$config->get('Attr.EnableID')) return false;
|
||||
if (!$this->selector && !$config->get('Attr.EnableID')) return false;
|
||||
|
||||
$id = trim($id); // trim it first
|
||||
|
||||
@@ -33,10 +43,10 @@ class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef
|
||||
'%Attr.IDPrefix is set', E_USER_WARNING);
|
||||
}
|
||||
|
||||
//if (!$this->ref) {
|
||||
if (!$this->selector) {
|
||||
$id_accumulator =& $context->get('IDAccumulator');
|
||||
if (isset($id_accumulator->ids[$id])) return false;
|
||||
//}
|
||||
}
|
||||
|
||||
// we purposely avoid using regex, hopefully this is faster
|
||||
|
||||
@@ -56,7 +66,7 @@ class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef
|
||||
return false;
|
||||
}
|
||||
|
||||
if (/*!$this->ref && */$result) $id_accumulator->add($id);
|
||||
if (!$this->selector && $result) $id_accumulator->add($id);
|
||||
|
||||
// if no change was made to the ID, return the result
|
||||
// else, return the new id if stripping whitespace made it
|
||||
|
||||
@@ -19,7 +19,7 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
|
||||
}
|
||||
|
||||
/**
 * Creates a URI AttrDef from a definition string.
 *
 * Only the exact string 'embedded' turns the flag on; every other value
 * (including other non-empty strings) leaves it off. The earlier
 * `(bool) $string` assignment was overwritten on the very next line and
 * has been dropped as a dead store.
 *
 * @param string $string Definition string; 'embedded' enables the flag.
 * @return HTMLPurifier_AttrDef_URI New instance built with that flag as
 *         its constructor argument (presumably the "embeds resource"
 *         switch; confirm against the constructor).
 */
public function make($string) {
    $embeds = ($string === 'embedded');
    return new HTMLPurifier_AttrDef_URI($embeds);
}
|
||||
|
||||
|
||||
@@ -23,6 +23,12 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
|
||||
|
||||
public function validate($string, $config, $context) {
|
||||
$length = strlen($string);
|
||||
// empty hostname is OK; it's usually semantically equivalent:
|
||||
// the default host as defined by a URI scheme is used:
|
||||
//
|
||||
// If the URI scheme defines a default for host, then that
|
||||
// default applies when the host subcomponent is undefined
|
||||
// or when the registered name is empty (zero length).
|
||||
if ($string === '') return '';
|
||||
if ($length > 1 && $string[0] === '[' && $string[$length-1] === ']') {
|
||||
//IPv6
|
||||
@@ -38,9 +44,8 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
|
||||
|
||||
// A regular domain name.
|
||||
|
||||
// This breaks I18N domain names, but we don't have proper IRI support,
|
||||
// so force users to insert Punycode. If there's complaining we'll
|
||||
// try to fix things into an international friendly form.
|
||||
// This doesn't match I18N domain names, but we don't have proper IRI support,
|
||||
// so force users to insert Punycode.
|
||||
|
||||
// The productions describing this are:
|
||||
$a = '[a-z]'; // alpha
|
||||
@@ -51,10 +56,44 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
|
||||
// toplabel = alpha | alpha *( alphanum | "-" ) alphanum
|
||||
$toplabel = "$a($and*$an)?";
|
||||
// hostname = *( domainlabel "." ) toplabel [ "." ]
|
||||
$match = preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string);
|
||||
if (!$match) return false;
|
||||
if (preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string)) {
|
||||
return $string;
|
||||
}
|
||||
|
||||
return $string;
|
||||
// If we have Net_IDNA2 support, we can support IRIs by
|
||||
// punycoding them. (This is the most portable thing to do,
|
||||
// since otherwise we have to assume browsers support IRIs natively.)
|
||||
|
||||
if ($config->get('Core.EnableIDNA')) {
|
||||
$idna = new Net_IDNA2(array('encoding' => 'utf8', 'overlong' => false, 'strict' => true));
|
||||
// we need to encode each period separately
|
||||
$parts = explode('.', $string);
|
||||
try {
|
||||
$new_parts = array();
|
||||
foreach ($parts as $part) {
|
||||
$encodable = false;
|
||||
for ($i = 0, $c = strlen($part); $i < $c; $i++) {
|
||||
if (ord($part[$i]) > 0x7a) {
|
||||
$encodable = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!$encodable) {
|
||||
$new_parts[] = $part;
|
||||
} else {
|
||||
$new_parts[] = $idna->encode($part);
|
||||
}
|
||||
}
|
||||
$string = implode('.', $new_parts);
|
||||
if (preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string)) {
|
||||
return $string;
|
||||
}
|
||||
} catch (Exception $e) {
|
||||
// XXX error reporting
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user