Escaper.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392
  1. <?php
  2. /**
  3. * Zend Framework (http://framework.zend.com/)
  4. *
  5. * @link http://github.com/zendframework/zf2 for the canonical source repository
  6. * @copyright Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
  7. * @license http://framework.zend.com/license/new-bsd New BSD License
  8. */
  9. namespace Zend\Escaper;
  /**
   * Context specific methods for use in secure output escaping.
   *
   * Each public escape*() method targets exactly one output context (HTML body,
   * HTML attribute, Javascript, URL component, CSS). The attribute, Javascript
   * and CSS escapers work on UTF-8 internally: input is converted from the
   * configured encoding, escaped, and converted back.
   */
  class Escaper
  {
      /**
       * Entity Map mapping Unicode codepoints to any available named HTML entities.
       *
       * While HTML supports far more named entities, the lowest common denominator
       * has become HTML5's XML Serialisation which is restricted to those named
       * entities that XML supports. Using HTML entities would result in this error:
       *     XML Parsing Error: undefined entity
       *
       * @var array
       */
      protected static $htmlNamedEntityMap = [
          34 => 'quot', // quotation mark
          38 => 'amp',  // ampersand
          60 => 'lt',   // less-than sign
          62 => 'gt',   // greater-than sign
      ];

      /**
       * Current encoding for escaping. If not UTF-8, we convert strings from this encoding
       * pre-escaping and back to this encoding post-escaping.
       *
       * @var string
       */
      protected $encoding = 'utf-8';

      /**
       * Holds the value of the special flags passed as second parameter to
       * htmlspecialchars().
       *
       * @var int
       */
      protected $htmlSpecialCharsFlags;

      /**
       * Static Matcher which escapes characters for HTML Attribute contexts
       *
       * @var callable
       */
      protected $htmlAttrMatcher;

      /**
       * Static Matcher which escapes characters for Javascript contexts
       *
       * @var callable
       */
      protected $jsMatcher;

      /**
       * Static Matcher which escapes characters for CSS Attribute contexts
       *
       * @var callable
       */
      protected $cssMatcher;

      /**
       * List of all encodings supported by this class (lower-case spellings).
       *
       * @var array
       */
      protected $supportedEncodings = [
          'iso-8859-1',   'iso8859-1',    'iso-8859-5',   'iso8859-5',
          'iso-8859-15',  'iso8859-15',   'utf-8',        'cp866',
          'ibm866',       '866',          'cp1251',       'windows-1251',
          'win-1251',     '1251',         'cp1252',       'windows-1252',
          '1252',         'koi8-r',       'koi8-ru',      'koi8r',
          'big5',         '950',          'gb2312',       '936',
          'big5-hkscs',   'shift_jis',    'sjis',         'sjis-win',
          'cp932',        '932',          'euc-jp',       'eucjp',
          'eucjp-win',    'macroman'
      ];

      /**
       * Constructor: Single parameter allows setting of global encoding for use by
       * the current object. When omitted (null) the default of 'utf-8' is kept.
       *
       * @param string|null $encoding Case-insensitive name of a supported encoding
       * @throws Exception\InvalidArgumentException If $encoding is not a string, is
       *                                            blank, or is not a supported encoding
       */
      public function __construct($encoding = null)
      {
          if ($encoding !== null) {
              if (! is_string($encoding)) {
                  throw new Exception\InvalidArgumentException(
                      get_class($this) . ' constructor parameter must be a string, received ' . gettype($encoding)
                  );
              }
              if ($encoding === '') {
                  throw new Exception\InvalidArgumentException(
                      get_class($this) . ' constructor parameter does not allow a blank value'
                  );
              }

              $encoding = strtolower($encoding);

              // Strict comparison is required: several supported names are numeric
              // strings ('950', '1251', ...), and a loose comparison would accept any
              // numerically equal spelling such as '9.5e2' or '950.0'.
              if (! in_array($encoding, $this->supportedEncodings, true)) {
                  throw new Exception\InvalidArgumentException(
                      'Value of \'' . $encoding . '\' passed to ' . get_class($this)
                      . ' constructor parameter is invalid. Provide an encoding supported by htmlspecialchars()'
                  );
              }

              $this->encoding = $encoding;
          }

          // We take advantage of ENT_SUBSTITUTE flag to correctly deal with invalid UTF-8 sequences.
          $this->htmlSpecialCharsFlags = ENT_QUOTES | ENT_SUBSTITUTE;

          // set matcher callbacks
          $this->htmlAttrMatcher = [$this, 'htmlAttrMatcher'];
          $this->jsMatcher       = [$this, 'jsMatcher'];
          $this->cssMatcher      = [$this, 'cssMatcher'];
      }

      /**
       * Return the encoding that all output/input is expected to be encoded in.
       *
       * @return string
       */
      public function getEncoding()
      {
          return $this->encoding;
      }

      /**
       * Escape a string for the HTML Body context where there are very few characters
       * of special meaning. Internally this will use htmlspecialchars().
       *
       * @param string $string
       * @return string
       */
      public function escapeHtml($string)
      {
          return htmlspecialchars($string, $this->htmlSpecialCharsFlags, $this->encoding);
      }

      /**
       * Escape a string for the HTML Attribute context. We use an extended set of characters
       * to escape that are not covered by htmlspecialchars() to cover cases where an attribute
       * might be unquoted or quoted illegally (e.g. backticks are valid quotes for IE).
       *
       * Everything except ASCII alphanumerics and , . - _ is escaped. Empty strings and
       * strings made only of digits are returned unchanged.
       *
       * @param string $string
       * @throws Exception\RuntimeException If the input is not valid in the configured encoding
       * @return string
       */
      public function escapeHtmlAttr($string)
      {
          $string = $this->toUtf8($string);
          if ($string === '' || ctype_digit($string)) {
              return $string;
          }

          $result = preg_replace_callback('/[^a-z0-9,\.\-_]/iSu', $this->htmlAttrMatcher, $string);
          return $this->fromUtf8($result);
      }

      /**
       * Escape a string for the Javascript context. This does not use json_encode(). An extended
       * set of characters are escaped beyond ECMAScript's rules for Javascript literal string
       * escaping in order to prevent misinterpretation of Javascript as HTML leading to the
       * injection of special characters and entities. The escaping used should be tolerant
       * of cases where HTML escaping was not applied on top of Javascript escaping correctly.
       * Backslash escaping is not used as it still leaves the escaped character as-is and so
       * is not useful in a HTML context.
       *
       * Everything except ASCII alphanumerics and , . _ is escaped. Empty strings and
       * strings made only of digits are returned unchanged.
       *
       * @param string $string
       * @throws Exception\RuntimeException If the input is not valid in the configured encoding
       * @return string
       */
      public function escapeJs($string)
      {
          $string = $this->toUtf8($string);
          if ($string === '' || ctype_digit($string)) {
              return $string;
          }

          $result = preg_replace_callback('/[^a-z0-9,\._]/iSu', $this->jsMatcher, $string);
          return $this->fromUtf8($result);
      }

      /**
       * Escape a string for the URI or Parameter contexts. This should not be used to escape
       * an entire URI - only a subcomponent being inserted. The function is a simple proxy
       * to rawurlencode() which now implements RFC 3986 since PHP 5.3 completely.
       *
       * @param string $string
       * @return string
       */
      public function escapeUrl($string)
      {
          return rawurlencode($string);
      }

      /**
       * Escape a string for the CSS context. CSS escaping can be applied to any string being
       * inserted into CSS and escapes everything except alphanumerics. Empty strings and
       * strings made only of digits are returned unchanged.
       *
       * @param string $string
       * @throws Exception\RuntimeException If the input is not valid in the configured encoding
       * @return string
       */
      public function escapeCss($string)
      {
          $string = $this->toUtf8($string);
          if ($string === '' || ctype_digit($string)) {
              return $string;
          }

          $result = preg_replace_callback('/[^a-z0-9]/iSu', $this->cssMatcher, $string);
          return $this->fromUtf8($result);
      }

      /**
       * Callback function for preg_replace_callback that applies HTML Attribute
       * escaping to all matches.
       *
       * @param array $matches Match data; $matches[0] is a single UTF-8 character
       * @return string The entity that replaces the matched character
       */
      protected function htmlAttrMatcher($matches)
      {
          $chr = $matches[0];

          /**
           * Resolve the full Unicode code point before any range check. ord() only
           * reads the first byte, which for a multi-byte UTF-8 sequence is the lead
           * byte rather than the code point, so U+0080 - U+009F would otherwise
           * slip past the "undefined in HTML" test below.
           */
          if (strlen($chr) === 1) {
              $ord = ord($chr);
          } else {
              $ord = hexdec(bin2hex($this->convertEncoding($chr, 'UTF-32BE', 'UTF-8')));
          }

          /**
           * The following replaces characters undefined in HTML with the
           * hex entity for the Unicode replacement character.
           */
          if (($ord <= 0x1f && $chr !== "\t" && $chr !== "\n" && $chr !== "\r")
              || ($ord >= 0x7f && $ord <= 0x9f)
          ) {
              return '&#xFFFD;';
          }

          /**
           * Check if the current character to escape has a named entity we should
           * replace it with.
           */
          if (isset(static::$htmlNamedEntityMap[$ord])) {
              return '&' . static::$htmlNamedEntityMap[$ord] . ';';
          }

          /**
           * Per OWASP recommendations, we'll use upper hex entities
           * for any other characters where a named entity does not exist.
           */
          if ($ord > 255) {
              return sprintf('&#x%04X;', $ord);
          }
          return sprintf('&#x%02X;', $ord);
      }

      /**
       * Callback function for preg_replace_callback that applies Javascript
       * escaping to all matches.
       *
       * Single-byte characters become \xHH. Multi-byte characters are converted to
       * UTF-16BE and emitted as one \uHHHH unit, or as two units (a surrogate pair)
       * when the UTF-16 form is longer than four hex digits.
       *
       * @param array $matches Match data; $matches[0] is a single UTF-8 character
       * @return string
       */
      protected function jsMatcher($matches)
      {
          $chr = $matches[0];
          if (strlen($chr) === 1) {
              return sprintf('\\x%02X', ord($chr));
          }

          $chr = $this->convertEncoding($chr, 'UTF-16BE', 'UTF-8');
          $hex = strtoupper(bin2hex($chr));
          if (strlen($hex) <= 4) {
              return sprintf('\\u%04s', $hex);
          }

          $highSurrogate = substr($hex, 0, 4);
          $lowSurrogate  = substr($hex, 4, 4);
          return sprintf('\\u%04s\\u%04s', $highSurrogate, $lowSurrogate);
      }

      /**
       * Callback function for preg_replace_callback that applies CSS
       * escaping to all matches.
       *
       * The result is a backslash, the code point in upper-case hex, and a trailing
       * space.
       *
       * @param array $matches Match data; $matches[0] is a single UTF-8 character
       * @return string
       */
      protected function cssMatcher($matches)
      {
          $chr = $matches[0];
          if (strlen($chr) === 1) {
              $ord = ord($chr);
          } else {
              $chr = $this->convertEncoding($chr, 'UTF-32BE', 'UTF-8');
              $ord = hexdec(bin2hex($chr));
          }
          return sprintf('\\%X ', $ord);
      }

      /**
       * Converts a string to UTF-8 from the base encoding. The base encoding is set via this
       * class' constructor.
       *
       * @param string $string
       * @throws Exception\RuntimeException If the (converted) string is not valid UTF-8
       * @return string
       */
      protected function toUtf8($string)
      {
          if ($this->getEncoding() === 'utf-8') {
              $result = $string;
          } else {
              $result = $this->convertEncoding($string, 'UTF-8', $this->getEncoding());
          }

          if (! $this->isUtf8($result)) {
              throw new Exception\RuntimeException(
                  sprintf('String to be escaped was not valid UTF-8 or could not be converted: %s', $result)
              );
          }

          return $result;
      }

      /**
       * Converts a string from UTF-8 to the base encoding. The base encoding is set via this
       * class' constructor.
       *
       * @param string $string
       * @return string
       */
      protected function fromUtf8($string)
      {
          if ($this->getEncoding() === 'utf-8') {
              return $string;
          }

          return $this->convertEncoding($string, $this->getEncoding(), 'UTF-8');
      }

      /**
       * Checks if a given string appears to be valid UTF-8 or not.
       *
       * The pattern only needs to match one character: with the "u" modifier
       * preg_match() fails on a subject that is not valid UTF-8.
       *
       * @param string $string
       * @return bool
       */
      protected function isUtf8($string)
      {
          return ($string === '' || preg_match('/^./su', $string) === 1);
      }

      /**
       * Encoding conversion helper which wraps iconv and mbstring where they exist or throws
       * an exception where neither is available. iconv is preferred when both exist.
       *
       * @param string $string
       * @param string $to
       * @param array|string $from
       * @throws Exception\RuntimeException If neither iconv nor mbstring is available
       * @return string The converted string, or '' when the conversion fails
       */
      protected function convertEncoding($string, $to, $from)
      {
          if (function_exists('iconv')) {
              $result = iconv($from, $to, $string);
          } elseif (function_exists('mb_convert_encoding')) {
              $result = mb_convert_encoding($string, $to, $from);
          } else {
              throw new Exception\RuntimeException(
                  get_class($this)
                  . ' requires either the iconv or mbstring extension to be installed'
                  . ' when escaping for non UTF-8 strings.'
              );
          }

          if ($result === false) {
              return ''; // return non-fatal blank string on encoding errors from users
          }
          return $result;
      }
  }