<?php
  2. /**
  3. * Injector that auto paragraphs text in the root node based on
  4. * double-spacing.
  5. * @todo Ensure all states are unit tested, including variations as well.
  6. * @todo Make a graph of the flow control for this Injector.
  7. */
  8. class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
  9. {
  10. /**
  11. * @type string
  12. */
  13. public $name = 'AutoParagraph';
  14. /**
  15. * @type array
  16. */
  17. public $needed = array('p');
  18. /**
  19. * @return HTMLPurifier_Token_Start
  20. */
  21. private function _pStart()
  22. {
  23. $par = new HTMLPurifier_Token_Start('p');
  24. $par->armor['MakeWellFormed_TagClosedError'] = true;
  25. return $par;
  26. }
  27. /**
  28. * @param HTMLPurifier_Token_Text $token
  29. */
  30. public function handleText(&$token)
  31. {
  32. $text = $token->data;
  33. // Does the current parent allow <p> tags?
  34. if ($this->allowsElement('p')) {
  35. if (empty($this->currentNesting) || strpos($text, "\n\n") !== false) {
  36. // Note that we have differing behavior when dealing with text
  37. // in the anonymous root node, or a node inside the document.
  38. // If the text as a double-newline, the treatment is the same;
  39. // if it doesn't, see the next if-block if you're in the document.
  40. $i = $nesting = null;
  41. if (!$this->forwardUntilEndToken($i, $current, $nesting) && $token->is_whitespace) {
  42. // State 1.1: ... ^ (whitespace, then document end)
  43. // ----
  44. // This is a degenerate case
  45. } else {
  46. if (!$token->is_whitespace || $this->_isInline($current)) {
  47. // State 1.2: PAR1
  48. // ----
  49. // State 1.3: PAR1\n\nPAR2
  50. // ------------
  51. // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
  52. // ------------
  53. $token = array($this->_pStart());
  54. $this->_splitText($text, $token);
  55. } else {
  56. // State 1.5: \n<hr />
  57. // --
  58. }
  59. }
  60. } else {
  61. // State 2: <div>PAR1... (similar to 1.4)
  62. // ----
  63. // We're in an element that allows paragraph tags, but we're not
  64. // sure if we're going to need them.
  65. if ($this->_pLookAhead()) {
  66. // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
  67. // ----
  68. // Note: This will always be the first child, since any
  69. // previous inline element would have triggered this very
  70. // same routine, and found the double newline. One possible
  71. // exception would be a comment.
  72. $token = array($this->_pStart(), $token);
  73. } else {
  74. // State 2.2.1: <div>PAR1<div>
  75. // ----
  76. // State 2.2.2: <div>PAR1<b>PAR1</b></div>
  77. // ----
  78. }
  79. }
  80. // Is the current parent a <p> tag?
  81. } elseif (!empty($this->currentNesting) &&
  82. $this->currentNesting[count($this->currentNesting) - 1]->name == 'p') {
  83. // State 3.1: ...<p>PAR1
  84. // ----
  85. // State 3.2: ...<p>PAR1\n\nPAR2
  86. // ------------
  87. $token = array();
  88. $this->_splitText($text, $token);
  89. // Abort!
  90. } else {
  91. // State 4.1: ...<b>PAR1
  92. // ----
  93. // State 4.2: ...<b>PAR1\n\nPAR2
  94. // ------------
  95. }
  96. }
  97. /**
  98. * @param HTMLPurifier_Token $token
  99. */
  100. public function handleElement(&$token)
  101. {
  102. // We don't have to check if we're already in a <p> tag for block
  103. // tokens, because the tag would have been autoclosed by MakeWellFormed.
  104. if ($this->allowsElement('p')) {
  105. if (!empty($this->currentNesting)) {
  106. if ($this->_isInline($token)) {
  107. // State 1: <div>...<b>
  108. // ---
  109. // Check if this token is adjacent to the parent token
  110. // (seek backwards until token isn't whitespace)
  111. $i = null;
  112. $this->backward($i, $prev);
  113. if (!$prev instanceof HTMLPurifier_Token_Start) {
  114. // Token wasn't adjacent
  115. if ($prev instanceof HTMLPurifier_Token_Text &&
  116. substr($prev->data, -2) === "\n\n"
  117. ) {
  118. // State 1.1.4: <div><p>PAR1</p>\n\n<b>
  119. // ---
  120. // Quite frankly, this should be handled by splitText
  121. $token = array($this->_pStart(), $token);
  122. } else {
  123. // State 1.1.1: <div><p>PAR1</p><b>
  124. // ---
  125. // State 1.1.2: <div><br /><b>
  126. // ---
  127. // State 1.1.3: <div>PAR<b>
  128. // ---
  129. }
  130. } else {
  131. // State 1.2.1: <div><b>
  132. // ---
  133. // Lookahead to see if <p> is needed.
  134. if ($this->_pLookAhead()) {
  135. // State 1.3.1: <div><b>PAR1\n\nPAR2
  136. // ---
  137. $token = array($this->_pStart(), $token);
  138. } else {
  139. // State 1.3.2: <div><b>PAR1</b></div>
  140. // ---
  141. // State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div>
  142. // ---
  143. }
  144. }
  145. } else {
  146. // State 2.3: ...<div>
  147. // -----
  148. }
  149. } else {
  150. if ($this->_isInline($token)) {
  151. // State 3.1: <b>
  152. // ---
  153. // This is where the {p} tag is inserted, not reflected in
  154. // inputTokens yet, however.
  155. $token = array($this->_pStart(), $token);
  156. } else {
  157. // State 3.2: <div>
  158. // -----
  159. }
  160. $i = null;
  161. if ($this->backward($i, $prev)) {
  162. if (!$prev instanceof HTMLPurifier_Token_Text) {
  163. // State 3.1.1: ...</p>{p}<b>
  164. // ---
  165. // State 3.2.1: ...</p><div>
  166. // -----
  167. if (!is_array($token)) {
  168. $token = array($token);
  169. }
  170. array_unshift($token, new HTMLPurifier_Token_Text("\n\n"));
  171. } else {
  172. // State 3.1.2: ...</p>\n\n{p}<b>
  173. // ---
  174. // State 3.2.2: ...</p>\n\n<div>
  175. // -----
  176. // Note: PAR<ELEM> cannot occur because PAR would have been
  177. // wrapped in <p> tags.
  178. }
  179. }
  180. }
  181. } else {
  182. // State 2.2: <ul><li>
  183. // ----
  184. // State 2.4: <p><b>
  185. // ---
  186. }
  187. }
  188. /**
  189. * Splits up a text in paragraph tokens and appends them
  190. * to the result stream that will replace the original
  191. * @param string $data String text data that will be processed
  192. * into paragraphs
  193. * @param HTMLPurifier_Token[] $result Reference to array of tokens that the
  194. * tags will be appended onto
  195. */
  196. private function _splitText($data, &$result)
  197. {
  198. $raw_paragraphs = explode("\n\n", $data);
  199. $paragraphs = array(); // without empty paragraphs
  200. $needs_start = false;
  201. $needs_end = false;
  202. $c = count($raw_paragraphs);
  203. if ($c == 1) {
  204. // There were no double-newlines, abort quickly. In theory this
  205. // should never happen.
  206. $result[] = new HTMLPurifier_Token_Text($data);
  207. return;
  208. }
  209. for ($i = 0; $i < $c; $i++) {
  210. $par = $raw_paragraphs[$i];
  211. if (trim($par) !== '') {
  212. $paragraphs[] = $par;
  213. } else {
  214. if ($i == 0) {
  215. // Double newline at the front
  216. if (empty($result)) {
  217. // The empty result indicates that the AutoParagraph
  218. // injector did not add any start paragraph tokens.
  219. // This means that we have been in a paragraph for
  220. // a while, and the newline means we should start a new one.
  221. $result[] = new HTMLPurifier_Token_End('p');
  222. $result[] = new HTMLPurifier_Token_Text("\n\n");
  223. // However, the start token should only be added if
  224. // there is more processing to be done (i.e. there are
  225. // real paragraphs in here). If there are none, the
  226. // next start paragraph tag will be handled by the
  227. // next call to the injector
  228. $needs_start = true;
  229. } else {
  230. // We just started a new paragraph!
  231. // Reinstate a double-newline for presentation's sake, since
  232. // it was in the source code.
  233. array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
  234. }
  235. } elseif ($i + 1 == $c) {
  236. // Double newline at the end
  237. // There should be a trailing </p> when we're finally done.
  238. $needs_end = true;
  239. }
  240. }
  241. }
  242. // Check if this was just a giant blob of whitespace. Move this earlier,
  243. // perhaps?
  244. if (empty($paragraphs)) {
  245. return;
  246. }
  247. // Add the start tag indicated by \n\n at the beginning of $data
  248. if ($needs_start) {
  249. $result[] = $this->_pStart();
  250. }
  251. // Append the paragraphs onto the result
  252. foreach ($paragraphs as $par) {
  253. $result[] = new HTMLPurifier_Token_Text($par);
  254. $result[] = new HTMLPurifier_Token_End('p');
  255. $result[] = new HTMLPurifier_Token_Text("\n\n");
  256. $result[] = $this->_pStart();
  257. }
  258. // Remove trailing start token; Injector will handle this later if
  259. // it was indeed needed. This prevents from needing to do a lookahead,
  260. // at the cost of a lookbehind later.
  261. array_pop($result);
  262. // If there is no need for an end tag, remove all of it and let
  263. // MakeWellFormed close it later.
  264. if (!$needs_end) {
  265. array_pop($result); // removes \n\n
  266. array_pop($result); // removes </p>
  267. }
  268. }
  269. /**
  270. * Returns true if passed token is inline (and, ergo, allowed in
  271. * paragraph tags)
  272. * @param HTMLPurifier_Token $token
  273. * @return bool
  274. */
  275. private function _isInline($token)
  276. {
  277. return isset($this->htmlDefinition->info['p']->child->elements[$token->name]);
  278. }
  279. /**
  280. * Looks ahead in the token list and determines whether or not we need
  281. * to insert a <p> tag.
  282. * @return bool
  283. */
  284. private function _pLookAhead()
  285. {
  286. if ($this->currentToken instanceof HTMLPurifier_Token_Start) {
  287. $nesting = 1;
  288. } else {
  289. $nesting = 0;
  290. }
  291. $ok = false;
  292. $i = null;
  293. while ($this->forwardUntilEndToken($i, $current, $nesting)) {
  294. $result = $this->_checkNeedsP($current);
  295. if ($result !== null) {
  296. $ok = $result;
  297. break;
  298. }
  299. }
  300. return $ok;
  301. }
  302. /**
  303. * Determines if a particular token requires an earlier inline token
  304. * to get a paragraph. This should be used with _forwardUntilEndToken
  305. * @param HTMLPurifier_Token $current
  306. * @return bool
  307. */
  308. private function _checkNeedsP($current)
  309. {
  310. if ($current instanceof HTMLPurifier_Token_Start) {
  311. if (!$this->_isInline($current)) {
  312. // <div>PAR1<div>
  313. // ----
  314. // Terminate early, since we hit a block element
  315. return false;
  316. }
  317. } elseif ($current instanceof HTMLPurifier_Token_Text) {
  318. if (strpos($current->data, "\n\n") !== false) {
  319. // <div>PAR1<b>PAR1\n\nPAR2
  320. // ----
  321. return true;
  322. } else {
  323. // <div>PAR1<b>PAR1...
  324. // ----
  325. }
  326. }
  327. return null;
  328. }
  329. }
// vim: et sw=4 sts=4