FontFamily.php 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. <?php
  2. /**
  3. * Validates a font family list according to CSS spec
  4. */
  5. class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
  6. {
  7. protected $mask = null;
  8. public function __construct()
  9. {
  10. // Lowercase letters
  11. $l = range('a', 'z');
  12. // Uppercase letters
  13. $u = range('A', 'Z');
  14. // Digits
  15. $d = range('0', '9');
  16. // Special bytes used by UTF-8
  17. $b = array_map('chr', range(0x80, 0xFF));
  18. // All valid characters for the mask
  19. $c = array_merge($l, $u, $d, $b);
  20. // Concatenate all valid characters into a string
  21. // Use '_- ' as an initial value
  22. $this->mask = array_reduce($c, function ($carry, $value) {
  23. return $carry . $value;
  24. }, '_- ');
  25. /*
  26. PHP's internal strcspn implementation is
  27. O(length of string * length of mask), making it inefficient
  28. for large masks. However, it's still faster than
  29. preg_match 8)
  30. for (p = s1;;) {
  31. spanp = s2;
  32. do {
  33. if (*spanp == c || p == s1_end) {
  34. return p - s1;
  35. }
  36. } while (spanp++ < (s2_end - 1));
  37. c = *++p;
  38. }
  39. */
  40. // possible optimization: invert the mask.
  41. }
  42. /**
  43. * @param string $string
  44. * @param HTMLPurifier_Config $config
  45. * @param HTMLPurifier_Context $context
  46. * @return bool|string
  47. */
  48. public function validate($string, $config, $context)
  49. {
  50. static $generic_names = array(
  51. 'serif' => true,
  52. 'sans-serif' => true,
  53. 'monospace' => true,
  54. 'fantasy' => true,
  55. 'cursive' => true
  56. );
  57. $allowed_fonts = $config->get('CSS.AllowedFonts');
  58. // assume that no font names contain commas in them
  59. $fonts = explode(',', $string);
  60. $final = '';
  61. foreach ($fonts as $font) {
  62. $font = trim($font);
  63. if ($font === '') {
  64. continue;
  65. }
  66. // match a generic name
  67. if (isset($generic_names[$font])) {
  68. if ($allowed_fonts === null || isset($allowed_fonts[$font])) {
  69. $final .= $font . ', ';
  70. }
  71. continue;
  72. }
  73. // match a quoted name
  74. if ($font[0] === '"' || $font[0] === "'") {
  75. $length = strlen($font);
  76. if ($length <= 2) {
  77. continue;
  78. }
  79. $quote = $font[0];
  80. if ($font[$length - 1] !== $quote) {
  81. continue;
  82. }
  83. $font = substr($font, 1, $length - 2);
  84. }
  85. $font = $this->expandCSSEscape($font);
  86. // $font is a pure representation of the font name
  87. if ($allowed_fonts !== null && !isset($allowed_fonts[$font])) {
  88. continue;
  89. }
  90. if (ctype_alnum($font) && $font !== '') {
  91. // very simple font, allow it in unharmed
  92. $final .= $font . ', ';
  93. continue;
  94. }
  95. // bugger out on whitespace. form feed (0C) really
  96. // shouldn't show up regardless
  97. $font = str_replace(array("\n", "\t", "\r", "\x0C"), ' ', $font);
  98. // Here, there are various classes of characters which need
  99. // to be treated differently:
  100. // - Alphanumeric characters are essentially safe. We
  101. // handled these above.
  102. // - Spaces require quoting, though most parsers will do
  103. // the right thing if there aren't any characters that
  104. // can be misinterpreted
  105. // - Dashes rarely occur, but they fairly unproblematic
  106. // for parsing/rendering purposes.
  107. // The above characters cover the majority of Western font
  108. // names.
  109. // - Arbitrary Unicode characters not in ASCII. Because
  110. // most parsers give little thought to Unicode, treatment
  111. // of these codepoints is basically uniform, even for
  112. // punctuation-like codepoints. These characters can
  113. // show up in non-Western pages and are supported by most
  114. // major browsers, for example: "MS 明朝" is a
  115. // legitimate font-name
  116. // <http://ja.wikipedia.org/wiki/MS_明朝>. See
  117. // the CSS3 spec for more examples:
  118. // <http://www.w3.org/TR/2011/WD-css3-fonts-20110324/localizedfamilynames.png>
  119. // You can see live samples of these on the Internet:
  120. // <http://www.google.co.jp/search?q=font-family+MS+明朝|ゴシック>
  121. // However, most of these fonts have ASCII equivalents:
  122. // for example, 'MS Mincho', and it's considered
  123. // professional to use ASCII font names instead of
  124. // Unicode font names. Thanks Takeshi Terada for
  125. // providing this information.
  126. // The following characters, to my knowledge, have not been
  127. // used to name font names.
  128. // - Single quote. While theoretically you might find a
  129. // font name that has a single quote in its name (serving
  130. // as an apostrophe, e.g. Dave's Scribble), I haven't
  131. // been able to find any actual examples of this.
  132. // Internet Explorer's cssText translation (which I
  133. // believe is invoked by innerHTML) normalizes any
  134. // quoting to single quotes, and fails to escape single
  135. // quotes. (Note that this is not IE's behavior for all
  136. // CSS properties, just some sort of special casing for
  137. // font-family). So a single quote *cannot* be used
  138. // safely in the font-family context if there will be an
  139. // innerHTML/cssText translation. Note that Firefox 3.x
  140. // does this too.
  141. // - Double quote. In IE, these get normalized to
  142. // single-quotes, no matter what the encoding. (Fun
  143. // fact, in IE8, the 'content' CSS property gained
  144. // support, where they special cased to preserve encoded
  145. // double quotes, but still translate unadorned double
  146. // quotes into single quotes.) So, because their
  147. // fixpoint behavior is identical to single quotes, they
  148. // cannot be allowed either. Firefox 3.x displays
  149. // single-quote style behavior.
  150. // - Backslashes are reduced by one (so \\ -> \) every
  151. // iteration, so they cannot be used safely. This shows
  152. // up in IE7, IE8 and FF3
  153. // - Semicolons, commas and backticks are handled properly.
  154. // - The rest of the ASCII punctuation is handled properly.
  155. // We haven't checked what browsers do to unadorned
  156. // versions, but this is not important as long as the
  157. // browser doesn't /remove/ surrounding quotes (as IE does
  158. // for HTML).
  159. //
  160. // With these results in hand, we conclude that there are
  161. // various levels of safety:
  162. // - Paranoid: alphanumeric, spaces and dashes(?)
  163. // - International: Paranoid + non-ASCII Unicode
  164. // - Edgy: Everything except quotes, backslashes
  165. // - NoJS: Standards compliance, e.g. sod IE. Note that
  166. // with some judicious character escaping (since certain
  167. // types of escaping doesn't work) this is theoretically
  168. // OK as long as innerHTML/cssText is not called.
  169. // We believe that international is a reasonable default
  170. // (that we will implement now), and once we do more
  171. // extensive research, we may feel comfortable with dropping
  172. // it down to edgy.
  173. // Edgy: alphanumeric, spaces, dashes, underscores and Unicode. Use of
  174. // str(c)spn assumes that the string was already well formed
  175. // Unicode (which of course it is).
  176. if (strspn($font, $this->mask) !== strlen($font)) {
  177. continue;
  178. }
  179. // Historical:
  180. // In the absence of innerHTML/cssText, these ugly
  181. // transforms don't pose a security risk (as \\ and \"
  182. // might--these escapes are not supported by most browsers).
  183. // We could try to be clever and use single-quote wrapping
  184. // when there is a double quote present, but I have choosen
  185. // not to implement that. (NOTE: you can reduce the amount
  186. // of escapes by one depending on what quoting style you use)
  187. // $font = str_replace('\\', '\\5C ', $font);
  188. // $font = str_replace('"', '\\22 ', $font);
  189. // $font = str_replace("'", '\\27 ', $font);
  190. // font possibly with spaces, requires quoting
  191. $final .= "'$font', ";
  192. }
  193. $final = rtrim($final, ', ');
  194. if ($final === '') {
  195. return false;
  196. }
  197. return $final;
  198. }
  199. }
  200. // vim: et sw=4 sts=4