DirectLex.php 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539
  1. <?php
  2. /**
  3. * Our in-house implementation of a parser.
  4. *
  5. * A pure PHP parser, DirectLex has absolutely no dependencies, making
  6. * it a reasonably good default for PHP4. Written with efficiency in mind,
  7. * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
  8. * pales in comparison to HTMLPurifier_Lexer_DOMLex.
  9. *
  10. * @todo Reread XML spec and document differences.
  11. */
  12. class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
  13. {
  /**
   * Set to true: tokenizeHTML() in this class can attach line/column
   * positions to the tokens it emits when line tracking is enabled.
   * @type bool
   */
  public $tracksLineNumbers = true;

  /**
   * The characters this lexer treats as whitespace (space, tab, CR, LF),
   * kept as a plain character list so it can be passed straight to
   * strspn() and strcspn().
   * @type string
   */
  protected $_whitespace = "\x20\x09\x0D\x0A";
  /**
   * preg_replace_callback() handler for the script CDATA fudge: the text
   * between the script tags is HTML-escaped, the tags themselves are
   * passed through untouched.
   * @param array $matches Match groups: [1] opening tag, [2] contents, [3] closing tag
   * @return string The opening tag, escaped contents and closing tag joined back together
   */
  protected function scriptCallback($matches)
  {
      $openingTag = $matches[1];
      $escapedBody = htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8');
      $closingTag = $matches[3];

      return $openingTag . $escapedBody . $closingTag;
  }
  /**
   * Splits an HTML string into a flat list of tokens (text, comment,
   * start, end and empty-tag tokens) by scanning for '<' and '>'.
   *
   * Configuration read: HTML.Trusted (escape unarmored script bodies
   * first), Core.MaintainLineNumbers (falls back to Core.CollectErrors
   * when null), Core.DirectLexLineNumberSyncInterval and
   * Core.CollectErrors (fetches 'ErrorCollector' from the context).
   *
   * While running, 'CurrentLine' and 'CurrentCol' are registered in the
   * context; both are destroyed again before returning.
   *
   * @param String $html
   * @param HTMLPurifier_Config $config
   * @param HTMLPurifier_Context $context
   * @return array|HTMLPurifier_Token[]
   */
  public function tokenizeHTML($html, $config, $context)
  {
      // special normalization for script tags without any armor
      // our "armor" heuristic is a < sign any number of whitespaces after
      // the first script tag
      if ($config->get('HTML.Trusted')) {
          $html = preg_replace_callback(
              '#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
              array($this, 'scriptCallback'),
              $html
          );
      }

      $html = $this->normalize($html, $config, $context);

      $cursor = 0; // our location in the text
      $inside_tag = false; // whether or not we're parsing the inside of a tag
      $array = array(); // result array

      // This is also treated to mean maintain *column* numbers too
      $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');

      if ($maintain_line_numbers === null) {
          // automatically determine line numbering by checking
          // if error collection is on
          $maintain_line_numbers = $config->get('Core.CollectErrors');
      }

      if ($maintain_line_numbers) {
          $current_line = 1;
          $current_col = 0;
          $length = strlen($html);
      } else {
          $current_line = false;
          $current_col = false;
          $length = false;
      }
      $context->register('CurrentLine', $current_line);
      $context->register('CurrentCol', $current_col);
      $nl = "\n";
      // how often to manually recalculate. This will ALWAYS be right,
      // but it's pretty wasteful. Set to 0 to turn off
      $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');

      $e = false;
      if ($config->get('Core.CollectErrors')) {
          $e =& $context->get('ErrorCollector');
      }

      // for testing synchronization
      $loops = 0;

      while (++$loops) {
          // $cursor is either at the start of a token, or inside of
          // a tag (i.e. there was a < immediately before it), as indicated
          // by $inside_tag

          if ($maintain_line_numbers) {
              // $rcursor, however, is always at the start of a token.
              $rcursor = $cursor - (int)$inside_tag;

              // Column number is cheap, so we calculate it every round.
              // We're interested at the *end* of the newline string, so
              // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
              // from our "rcursor" position.
              $nl_pos = strrpos($html, $nl, $rcursor - $length);
              $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);

              // recalculate lines
              if ($synchronize_interval && // synchronization is on
                  $cursor > 0 && // cursor is further than zero
                  $loops % $synchronize_interval === 0) { // time to synchronize!
                  $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
              }
          }

          $position_next_lt = strpos($html, '<', $cursor);
          $position_next_gt = strpos($html, '>', $cursor);

          // triggers on "<b>asdf</b>" but not "asdf <b></b>"
          // special case to set up context
          if ($position_next_lt === $cursor) {
              $inside_tag = true;
              $cursor++;
          }

          if (!$inside_tag && $position_next_lt !== false) {
              // We are not inside tag and there still is another tag to parse
              $token = new
              HTMLPurifier_Token_Text(
                  $this->parseText(
                      substr(
                          $html,
                          $cursor,
                          $position_next_lt - $cursor
                      ), $config
                  )
              );
              if ($maintain_line_numbers) {
                  $token->rawPosition($current_line, $current_col);
                  $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
              }
              $array[] = $token;
              $cursor = $position_next_lt + 1;
              $inside_tag = true;
              continue;
          } elseif (!$inside_tag) {
              // We are not inside tag but there are no more tags
              // If we're already at the end, break
              if ($cursor === strlen($html)) {
                  break;
              }
              // Create Text of rest of string
              $token = new
              HTMLPurifier_Token_Text(
                  $this->parseText(
                      substr(
                          $html,
                          $cursor
                      ), $config
                  )
              );
              if ($maintain_line_numbers) {
                  $token->rawPosition($current_line, $current_col);
              }
              $array[] = $token;
              break;
          } elseif ($inside_tag && $position_next_gt !== false) {
              // We are in tag and it is well formed
              // Grab the internals of the tag
              $strlen_segment = $position_next_gt - $cursor;

              if ($strlen_segment < 1) {
                  // there's nothing to process!
                  // NOTE(review): this token is built but never appended to
                  // $array, and $inside_tag stays true for the next round.
                  // Left as-is because changing it would alter the emitted
                  // token stream; confirm whether that is intended.
                  $token = new HTMLPurifier_Token_Text('<');
                  $cursor++;
                  continue;
              }

              $segment = substr($html, $cursor, $strlen_segment);

              if ($segment === false) {
                  // somehow, we attempted to access beyond the end of
                  // the string, defense-in-depth, reported by Nate Abele
                  break;
              }

              // Check if it's a comment
              if (substr($segment, 0, 3) === '!--') {
                  // re-determine segment length, looking for -->
                  $position_comment_end = strpos($html, '-->', $cursor);
                  if ($position_comment_end === false) {
                      // uh oh, we have a comment that extends to
                      // infinity. Can't be helped: set comment
                      // end position to end of string
                      if ($e) {
                          $e->send(E_WARNING, 'Lexer: Unclosed comment');
                      }
                      $position_comment_end = strlen($html);
                      $end = true;
                  } else {
                      $end = false;
                  }
                  $strlen_segment = $position_comment_end - $cursor;
                  $segment = substr($html, $cursor, $strlen_segment);
                  $token = new
                  HTMLPurifier_Token_Comment(
                      substr(
                          $segment,
                          3,
                          $strlen_segment - 3
                      )
                  );
                  if ($maintain_line_numbers) {
                      $token->rawPosition($current_line, $current_col);
                      $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                  }
                  $array[] = $token;
                  // skip the closing "-->" unless the comment ran to end of input
                  $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                  $inside_tag = false;
                  continue;
              }

              // Check if it's an end tag
              $is_end_tag = (strpos($segment, '/') === 0);
              if ($is_end_tag) {
                  $type = substr($segment, 1);
                  $token = new HTMLPurifier_Token_End($type);
                  if ($maintain_line_numbers) {
                      $token->rawPosition($current_line, $current_col);
                      $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                  }
                  $array[] = $token;
                  $inside_tag = false;
                  $cursor = $position_next_gt + 1;
                  continue;
              }

              // Check leading character is alphabetic, if not, we may
              // have accidentally grabbed an emoticon. Translate into
              // text and go our merry way
              if (!ctype_alpha($segment[0])) {
                  // XML:  $segment[0] !== '_' && $segment[0] !== ':'
                  if ($e) {
                      $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                  }
                  $token = new HTMLPurifier_Token_Text('<');
                  if ($maintain_line_numbers) {
                      $token->rawPosition($current_line, $current_col);
                      // $current_line is deliberately NOT advanced here:
                      // $cursor stays where it is, so the text that follows
                      // is scanned again by the next iteration, which counts
                      // its newlines itself. Counting them here as well
                      // would double-count and skew later line numbers.
                  }
                  $array[] = $token;
                  $inside_tag = false;
                  continue;
              }

              // Check if it is explicitly self closing, if so, remove
              // trailing slash. Remember, we could have a tag like <br>, so
              // any later token processing scripts must convert improperly
              // classified EmptyTags from StartTags.
              $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1);
              if ($is_self_closing) {
                  $strlen_segment--;
                  $segment = substr($segment, 0, $strlen_segment);
              }

              // Check if there are any attributes
              $position_first_space = strcspn($segment, $this->_whitespace);

              if ($position_first_space >= $strlen_segment) {
                  // no whitespace in the segment: the whole thing is the tag name
                  if ($is_self_closing) {
                      $token = new HTMLPurifier_Token_Empty($segment);
                  } else {
                      $token = new HTMLPurifier_Token_Start($segment);
                  }
                  if ($maintain_line_numbers) {
                      $token->rawPosition($current_line, $current_col);
                      $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                  }
                  $array[] = $token;
                  $inside_tag = false;
                  $cursor = $position_next_gt + 1;
                  continue;
              }

              // Grab out all the data
              $type = substr($segment, 0, $position_first_space);
              $attribute_string =
                  trim(
                      substr(
                          $segment,
                          $position_first_space
                      )
                  );
              if ($attribute_string) {
                  $attr = $this->parseAttributeString(
                      $attribute_string,
                      $config,
                      $context
                  );
              } else {
                  $attr = array();
              }

              if ($is_self_closing) {
                  $token = new HTMLPurifier_Token_Empty($type, $attr);
              } else {
                  $token = new HTMLPurifier_Token_Start($type, $attr);
              }
              if ($maintain_line_numbers) {
                  $token->rawPosition($current_line, $current_col);
                  $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
              }
              $array[] = $token;
              $cursor = $position_next_gt + 1;
              $inside_tag = false;
              continue;
          } else {
              // inside tag, but there's no ending > sign
              if ($e) {
                  $e->send(E_WARNING, 'Lexer: Missing gt');
              }
              $token = new
              HTMLPurifier_Token_Text(
                  '<' .
                  $this->parseText(
                      substr($html, $cursor), $config
                  )
              );
              if ($maintain_line_numbers) {
                  $token->rawPosition($current_line, $current_col);
              }
              // no cursor scroll? Hmm...
              $array[] = $token;
              break;
          }
          break;
      }

      $context->destroy('CurrentLine');
      $context->destroy('CurrentCol');
      return $array;
  }
  /**
   * substr_count() with offset and length support that also works on
   * PHP 5.0.x, where the native function lacks those two arguments.
   * The PHP version check is done once and cached in a static local.
   * @param string $haystack String to search in
   * @param string $needle Substring to count
   * @param int $offset Start of the window inside $haystack
   * @param int $length Size of the window
   * @return int Number of occurrences of $needle inside the window
   */
  protected function substrCount($haystack, $needle, $offset, $length)
  {
      static $isLegacyPhp;
      if ($isLegacyPhp === null) {
          $isLegacyPhp = version_compare(PHP_VERSION, '5.1', '<');
      }

      if (!$isLegacyPhp) {
          return substr_count($haystack, $needle, $offset, $length);
      }

      // Pre-5.1 fallback: cut the window out first, then count inside it.
      $window = substr($haystack, $offset, $length);
      return substr_count($window, $needle);
  }
  /**
   * Takes the inside of an HTML tag and makes an assoc array of attributes.
   *
   * Two shortcuts are tried first: a string with no '=' and no whitespace
   * is a single boolean attribute, and a string with exactly one '=' and
   * no whitespace is a single key/value pair. Everything else goes through
   * a cursor-based scan that handles quoted values, unquoted values and
   * boolean attributes in any mix. Values are passed through parseAttr().
   *
   * When Core.CollectErrors is on, problems are reported to the
   * 'ErrorCollector' taken from the context.
   *
   * @param string $string Inside of tag excluding name.
   * @param HTMLPurifier_Config $config
   * @param HTMLPurifier_Context $context
   * @return array Assoc array of attributes.
   * @throws Exception If the scan loop fails to advance (safety net).
   */
  public function parseAttributeString($string, $config, $context)
  {
      $string = (string)$string; // quick typecast

      if ($string === '') {
          return array();
      } // no attributes

      $e = false;
      if ($config->get('Core.CollectErrors')) {
          $e =& $context->get('ErrorCollector');
      }

      // let's see if we can abort as quickly as possible
      // one equal sign, no whitespace => one attribute
      $num_equal = substr_count($string, '=');
      // Any whitespace character counts (not only ' '), at any offset
      // including 0; otherwise tab/newline separated attributes would be
      // mistaken for a single attribute by the shortcuts below.
      $has_space = strcspn($string, $this->_whitespace) < strlen($string);
      if ($num_equal === 0 && !$has_space) {
          // bool attribute
          return array($string => $string);
      } elseif ($num_equal === 1 && !$has_space) {
          // only one attribute
          list($key, $quoted_value) = explode('=', $string);
          $quoted_value = trim($quoted_value);
          if (!$key) {
              if ($e) {
                  $e->send(E_ERROR, 'Lexer: Missing attribute key');
              }
              return array();
          }
          // Compare against '' explicitly: a plain truthiness test would
          // also discard the legitimate value "0" (e.g. border=0).
          if ($quoted_value === '') {
              return array($key => '');
          }
          // $quoted_value is non-empty here, so both offsets exist
          $first_char = $quoted_value[0];
          $last_char = $quoted_value[strlen($quoted_value) - 1];

          $same_quote = ($first_char == $last_char);
          $open_quote = ($first_char == '"' || $first_char == "'");

          if ($same_quote && $open_quote) {
              // well behaved
              $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
          } else {
              // not well behaved
              if ($open_quote) {
                  if ($e) {
                      $e->send(E_ERROR, 'Lexer: Missing end quote');
                  }
                  $value = substr($quoted_value, 1);
              } else {
                  $value = $quoted_value;
              }
          }
          // older PHP versions return false from substr() for an empty result
          if ($value === false) {
              $value = '';
          }
          return array($key => $this->parseAttr($value, $config));
      }

      // setup loop environment
      $array = array(); // return assoc array of attributes
      $cursor = 0; // current position in string (moves forward)
      $size = strlen($string); // size of the string (stays the same)

      // if we have unquoted attributes, the parser expects a terminating
      // space, so let's guarantee that there's always a terminating space.
      $string .= ' ';

      $old_cursor = -1;
      while ($cursor < $size) {
          if ($old_cursor >= $cursor) {
              throw new Exception("Infinite loop detected");
          }
          $old_cursor = $cursor;

          // skip leading whitespace
          $cursor += strspn($string, $this->_whitespace, $cursor);
          // grab the key

          $key_begin = $cursor; //we're currently at the start of the key

          // scroll past all characters that are the key (not whitespace or =)
          $cursor += strcspn($string, $this->_whitespace . '=', $cursor);

          $key_end = $cursor; // now at the end of the key

          $key = substr($string, $key_begin, $key_end - $key_begin);

          if (!$key) {
              if ($e) {
                  $e->send(E_ERROR, 'Lexer: Missing attribute key');
              }
              $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
              continue; // empty key
          }

          // scroll past all whitespace
          $cursor += strspn($string, $this->_whitespace, $cursor);

          if ($cursor >= $size) {
              // key was the last thing in the string: bool attribute
              $array[$key] = $key;
              break;
          }

          // if the next character is an equal sign, we've got a regular
          // pair, otherwise, it's a bool attribute
          // ($cursor < $size here, so the offset is always valid)
          $first_char = $string[$cursor];

          if ($first_char == '=') {
              // key="value"

              $cursor++;
              $cursor += strspn($string, $this->_whitespace, $cursor);

              // we might be in front of a quote right now; the cursor can
              // also sit just past the sentinel space, hence the isset()
              $char = isset($string[$cursor]) ? $string[$cursor] : '';

              if ($char == '"' || $char == "'") {
                  // it's quoted, end bound is $char
                  $cursor++;
                  $value_begin = $cursor;
                  $cursor = strpos($string, $char, $cursor);
                  $value_end = $cursor;
              } else {
                  // it's not quoted, end bound is whitespace
                  $value_begin = $cursor;
                  $cursor += strcspn($string, $this->_whitespace, $cursor);
                  $value_end = $cursor;
              }

              // we reached a premature end (no closing quote was found)
              if ($cursor === false) {
                  $cursor = $size;
                  $value_end = $cursor;
              }

              $value = substr($string, $value_begin, $value_end - $value_begin);
              if ($value === false) {
                  $value = '';
              }
              $array[$key] = $this->parseAttr($value, $config);
              $cursor++;
          } else {
              // boolattr ($key is known to be non-empty at this point)
              $array[$key] = $key;
          }
      }
      return $array;
  }
  480. }
  481. // vim: et sw=4 sts=4