Файловый менеджер - Редактировать - /home/rickpfrv/wiki.craftaro.com/vendor/wikimedia/parsoid/src/Wt2Html/Grammar.pegphp
Назад
/**
 * Combined Wiki (MediaWiki) and HTML tokenizer based on pegjs. Emits several
 * chunks of tokens (one chunk per top-level block matched) and eventually an
 * end event. Tokens map to HTML tags as far as possible, with custom tokens
 * used where further processing on the token stream is needed.
 */
{

use Wikimedia\Assert\UnreachableException;
use Wikimedia\Parsoid\Config\Env;
use Wikimedia\Parsoid\Config\SiteConfig;
use Wikimedia\Parsoid\Core\DomSourceRange;
use Wikimedia\Parsoid\NodeData\DataParsoid;
use Wikimedia\Parsoid\Tokens\CommentTk;
use Wikimedia\Parsoid\Tokens\EndTagTk;
use Wikimedia\Parsoid\Tokens\EOFTk;
use Wikimedia\Parsoid\Tokens\KV;
use Wikimedia\Parsoid\Tokens\KVSourceRange;
use Wikimedia\Parsoid\Tokens\NlTk;
use Wikimedia\Parsoid\Tokens\SelfclosingTagTk;
use Wikimedia\Parsoid\Tokens\SourceRange;
use Wikimedia\Parsoid\Tokens\TagTk;
use Wikimedia\Parsoid\Tokens\Token;
use Wikimedia\Parsoid\Utils\TokenUtils;
use Wikimedia\Parsoid\Utils\Utils;
use Wikimedia\Parsoid\Utils\PHPUtils;
use Wikimedia\Parsoid\Utils\WTUtils;
use Wikimedia\Parsoid\Wikitext\Consts;

}
{
	/** @var Env */
	private $env;

	/** @var SiteConfig */
	private $siteConfig;

	/** @var array */
	private $pipelineOpts;

	/** @var int */
	private $pipelineOffset;

	private $extTags;

	private $startTime;

	/** @var string */
	private $reUrltextLookahead;

	/** @var string */
	private $urltextPlainSegment = '';

	/** @var bool */
	private $urltextFoundAutolink = false;

	protected function initialize() {
		$this->env = $this->options['env'];
		$this->siteConfig = $this->env->getSiteConfig();

		$tokenizer = $this->options['pegTokenizer'];
		$this->pipelineOpts = $tokenizer->getOptions();
		// FIXME: inTemplate option may not always be set in
		// standalone tokenizers user by some pipelines handlers.
		$this->pipelineOffset = $this->options['pipelineOffset'] ?? 0;
		$this->extTags = $this->siteConfig->getExtensionTagNameMap();

		// Non-greedy text_char sequence: stop at ampersand, double-underscore,
		// magic link prefix or protocol
		$this->reUrltextLookahead = '!(?:' .
			'([^-\'<[{\n\r:;\]}|\!=&]*?)' .
			'(?:__|$|[-\'<[{\n\r:;\]}|\!=&]|(RFC|PMID|ISBN|' .
			'(?i)' . $this->siteConfig->getProtocolsRegex( true ) . ')))!A';
	}

	private $prevOffset = 0;
	private $headingIndex = 0;

	private function assert( $condition, $text ) {
		if ( !$condition ) {
			throw new \Exception( "Grammar.pegphp assertion failure: $text" );
		}
	}

	private function unreachable() {
		throw new UnreachableException( "Grammar.pegphp: this should be unreachable" );
	}

	// Some shorthands for legibility
	private function startOffset() {
		return $this->savedPos;
	}

	private function endOffset() {
		return $this->currPos;
	}

	private function tsrOffsets( $flag = 'default' ): SourceRange {
		switch ( $flag ) {
			case 'start':
				return new SourceRange( $this->savedPos, $this->savedPos );
			case 'end':
				return new SourceRange( $this->currPos, $this->currPos );
			default:
				return new SourceRange( $this->savedPos, $this->currPos );
		}
	}

	/*
	 * Emit a chunk of tokens to our consumers. Once this has been done, the
	 * current expression can return an empty list (true).
	 */
	private function emitChunk( $tokens ) {
		// FIXME: We don't expect nulls here, but looks like
		// hack from I1c695ab6cdd3655e98877c175ddbabdee9dc44b7
		// introduces them. Work around it for now!
		if ( !$tokens ) {
			return [];
		}

		// Shift tsr of all tokens by the pipeline offset
		TokenUtils::shiftTokenTSR( $tokens, $this->pipelineOffset );

		$this->env->log( 'trace/peg', $this->options['pipelineId'] ?? '0', '----> ', $tokens );

		$i = null;
		$n = count( $tokens );

		// Enforce parsing resource limits
		for ( $i = 0; $i < $n; $i++ ) {
			TokenizerUtils::enforceParserResourceLimits( $this->env, $tokens[ $i ] );
		}

		return $tokens;
	}

	/* ------------------------------------------------------------------------
	 * Extension tags should be parsed with higher priority than anything else.
	 *
	 * The trick we use is to strip out the content inside a matching tag-pair
	 * and not tokenize it. The content, if it needs to parsed (for example,
	 * for <ref>, <*include*> tags), is parsed in a fresh tokenizer context
	 * which means any error correction that needs to happen is restricted to
	 * the scope of the extension content and doesn't spill over to the higher
	 * level. Ex: <math><!--foo</math>.
	 *
	 * IGNORE: {{ this just balances the blocks in this comment for pegjs
	 *
	 * This trick also lets us prevent extension content (that don't accept WT)
	 * from being parsed as wikitext (Ex: <math>\frac{foo\frac{bar}}</math>)
	 * We don't want the "}}" being treated as a template closing tag and
	 * closing outer templates.
	 * --------------------------------------------------------------------- */

	private function isXMLTag( string $name, bool $block ): bool {
		$lName = mb_strtolower( $name );
		return $block ?
			TokenUtils::isWikitextBlockTag( $lName ) :
			isset( Consts::$HTML['HTML5Tags'][$lName] ) || isset( Consts::$HTML['OlderHTMLTags'][$lName] );
	}

	private function maybeAnnotationOrExtensionTag( Token $t, ?bool $end, array $attribs, SourceRange $tsr ) {
		$tagName = mb_strtolower( $t->getName() );

		$isAnnotationTag = $this->siteConfig->isAnnotationTag( $tagName );
		if ( !$isAnnotationTag ) {
			$pipepos = strpos( $tagName, '|' );
			if ( $pipepos ) {
				$strBeforePipe = substr( $tagName, 0, $pipepos );
				$isAnnotationTag = $this->siteConfig->isAnnotationTag( $strBeforePipe );
				if ( $isAnnotationTag ) {
					$attribs = [ new KV( "name", substr( $tagName, $pipepos + 1, strlen( $tagName ) - $pipepos - 1 ) ) ];
					$tagName = $strBeforePipe;
				}
			}
		}

		if ( $isAnnotationTag ) {
			$metaAttrs = [ new KV( 'typeof', 'mw:Annotation/' . $tagName . ( $end ? '/End' : '' ) ) ];
			if ( count( $attribs ) > 0 ) {
				$attrMap = [];
				foreach ( $attribs as $attr ) {
					// If the key or the value is not a string, we replace it by the thing that generated it and
					// consider that wikitext as a raw string instead.
					$k = is_string( $attr->k ) ? $attr->k : $attr->ksrc;
					$v = is_string( $attr->v ) ? $attr->v : $attr->vsrc;
					$attrMap[$k] = $v;
				}
				$datamw = [];
				// Possible follow-up in T295168 for attribute sanitation
				$datamw['attrs'] = $attrMap;
				array_push( $metaAttrs, new KV( 'data-mw', PHPUtils::jsonEncode( $datamw ) ) );
			}
			$dp = new DataParsoid();
			$dp->tsr = $tsr;
			$this->env->hasAnnotations = true;

			// FIXME: Suppress annotation meta tokens from template pipelines
			// since they may not have TSR values and won't get recognized as
			// annotation ranges. Without TSR, they might end up stuck in
			// fosterable positions and cause havoc on edits by breaking selser.
			if ( empty( $this->pipelineOpts['inTemplate'] ) ) {
				return [ new SelfclosingTagTk ( 'meta', $metaAttrs, $dp ) ];
			} else {
				return [];
			}
		}

		$isInstalledExt = isset( $this->extTags[$tagName] );
		$isIncludeTag = WTUtils::isIncludeTag( $tagName );

		// Extensions have higher precedence when they shadow html tags.
		if ( !( $isInstalledExt || $isIncludeTag ) ) {
			return $t;
		}

		$dp = $t->dataAttribs;
		$skipPos = $this->currPos;

		switch ( get_class( $t ) ) {
			case EndTagTk::class:
				if ( $isIncludeTag ) {
					return $t;
				}
				// Similar to TagTk, we rely on the sanitizer to convert to text
				// where necessary and emit tokens to ease the wikitext escaping
				// code. However, extension tags that shadow html tags will see
				// their unmatched end tags dropped while tree building, since
				// the sanitizer will let them through.
				return $t; // not text()

			case SelfclosingTagTk::class:
				$dp->src = $dp->tsr->substr( $this->input );
				$dp->extTagOffsets = new DomSourceRange(
					$dp->tsr->start, $dp->tsr->end,
					$dp->tsr->length(), 0
				);
				if ( $isIncludeTag ) {
					return $t;
				}
				break;

			case TagTk::class:
				$endTagRE = '~.*?(</' . preg_quote( $tagName, '~' ) . '\s*>)~iusA';
				$tagContentFound = preg_match( $endTagRE, $this->input, $tagContent, 0, $dp->tsr->start );

				if ( !$tagContentFound ) {
					$dp->src = $dp->tsr->substr( $this->input );
					$dp->extTagOffsets = new DomSourceRange(
						$dp->tsr->start, $dp->tsr->end,
						$dp->tsr->length(), 0
					);
					if ( $isIncludeTag ) {
						return $t;
					} else {
						// This is undefined behaviour. The old parser currently
						// returns text here (see core commit 674e8388cba),
						// whereas this results in unclosed
						// extension tags that shadow html tags falling back to
						// their html equivalent. The sanitizer will take care
						// of converting to text where necessary. We do this to
						// simplify `hasWikitextTokens` when escaping wikitext,
						// which wants these as tokens because it's otherwise
						// lacking in context.
						return $t; // not text()
					}
				}

				$extSrc = $tagContent[0];
				$extEndOffset = $dp->tsr->start + strlen( $extSrc );
				$extEndTagWidth = strlen( $tagContent[1] );

				if ( !empty( $this->pipelineOpts['inTemplate'] ) ) {
					// Support 1-level of nesting in extensions tags while
					// tokenizing in templates to support the #tag parser function.
					//
					// It's necessary to permit this broadly in templates because
					// there's no way to distinguish whether the nesting happened
					// while expanding the #tag parser function, or just a general
					// syntax errors. In other words,
					//
					//   hi<ref>ho<ref>hi</ref>ho</ref>
					//
					// and
					//
					//   hi{{#tag:ref|ho<ref>hi</ref>ho}}
					//
					// found in template are returned indistinguishably after a
					// preprocessing request, though the old parser renders them
					// differently. #tag in template is probably a common enough
					// use case that we want to accept these false positives,
					// though another approach could be to drop this code here, and
					// invoke a native #tag handler and forgo those in templates.
					//
					// Expand `extSrc` as long as there is a <tagName> found in the
					// extension source body.
					$startTagRE = '~<' . preg_quote( $tagName, '~' ) . '([^/>]|/(?!>))*>~i';
					$s = substr( $extSrc, $dp->tsr->end - $dp->tsr->start );
					while ( strlen( $s ) ) {
						if ( !preg_match( $startTagRE, $s ) ) {
							break;
						}
						if ( !preg_match( $endTagRE, $this->input, $tagContent, 0, $extEndOffset ) ) {
							break;
						}
						$s = $tagContent[0];
						$extEndOffset += strlen( $s );
						$extEndTagWidth = strlen( $tagContent[1] );
						$extSrc .= $s;
					}
				}

				// Extension content source
				$dp->src = $extSrc;
				$dp->extTagOffsets = new DomSourceRange(
					$dp->tsr->start, $extEndOffset,
					$dp->tsr->length(), $extEndTagWidth
				);

				$skipPos = $dp->extTagOffsets->innerEnd();

				// If the xml-tag is a known installed (not native) extension,
				// skip the end-tag as well.
				if ( $isInstalledExt ) {
					$skipPos = $dp->extTagOffsets->end;
				}
				break;

			default:
				$this->unreachable();
		}

		$this->currPos = $skipPos;

		if ( $isInstalledExt ) {
			// update tsr->end to span the start and end tags.
			$dp->tsr->end = $this->endOffset(); // was just modified above
			return new SelfclosingTagTk( 'extension', [
					new KV( 'typeof', 'mw:Extension' ),
					new KV( 'name', $tagName ),
					new KV( 'about', $this->env->newAboutId() ),
					new KV( 'source', $dp->src ),
					new KV( 'options', $t->attribs )
				], $dp
			);
		} elseif ( $isIncludeTag ) {
			// Parse ext-content, strip eof, and shift tsr
			$extContent = $dp->extTagOffsets->stripTags( $dp->src );
			$tokenizer = new PegTokenizer( $this->env );
			$tokenizer->setSourceOffsets( new SourceRange( $dp->extTagOffsets->innerStart(), $dp->extTagOffsets->innerEnd() ) );
			$extContentToks = $tokenizer->tokenizeSync( $extContent );
			if ( $dp->extTagOffsets->closeWidth > 0 ) {
				TokenUtils::stripEOFTkFromTokens( $extContentToks );
			}
			array_unshift( $extContentToks, $t );
			return $extContentToks;
		} else {
			$this->unreachable();
		}
	}
}

/*********************************************************
 * The top-level rule
 *********************************************************/

start "start"
  = t:tlb* n:newlineToken* {
	if ( count( $t ) ) {
		$ret = TokenizerUtils::flattenIfArray( $t );
	} else {
		$ret = [];
	}
	if ( count( $n ) ) {
		PHPUtils::pushArray($ret, $n);
	}
	$ret[] = new EOFTk();
	return $ret;
}

/*
 * Redirects can only occur as the first thing in a document.  See
 * WikitextContent::getRedirectTarget()
 */
redirect
  = rw:redirect_word
	sp:$space_or_newline*
	c:$(":" space_or_newline*)?
	wl:wikilink & {
		return count( $wl ) === 1 && $wl[0] instanceof Token;
	} {
	$link = $wl[0];
	if ( $sp ) {
		$rw .= $sp;
	}
	if ( $c ) {
		$rw .= $c;
	}
	// Build a redirect token
	$dp = new DataParsoid;
	$dp->src = $rw;
	$dp->tsr = $this->tsrOffsets();
	$dp->linkTk = $link;
	$redirect = new SelfclosingTagTk( 'mw:redirect',
		// Put 'href' into attributes so it gets template-expanded
		[ $link->getAttributeKV( 'href' ) ],
		$dp
	);
	return $redirect;
}

// These rules are exposed as start rules.
generic_newline_attributes "generic_newline_attributes" = generic_newline_attribute*

table_attributes "table_attributes"
  = (table_attribute / optionalSpaceToken b:broken_table_attribute_name_char { return $b; })*

/* The 'redirect' magic word.
 * The leading whitespace allowed is due to the PHP trim() function.
 */
redirect_word
  = $( [ \t\n\r\0\x0b]*
	rw:$(!space_or_newline ![:\[] .)+
	& {
		return preg_match( $this->env->getSiteConfig()->getMagicWordMatcher( 'redirect' ), $rw );
	} )

/*
 * This rule exists to support tokenizing the document in chunks.
 * The parser's streaming interface will stop tokenization after each iteration
 * of the starred subexpression, and yield to the node.js event-loop to
 * schedule other pending event handlers.
 */
start_async
  = (
	(
		& {
			$this->startTime = null;
			if ( $this->env->profiling() ) {
				$profile = $this->env->getCurrentProfile();
				$this->startTime = microtime( true );
			}
			return true;
		}
		t:tlb
		& {
			if ( $this->env->profiling() ) {
				$profile = $this->env->getCurrentProfile();
				$profile->bumpTimeUse( 'PEG', 1000 * ( microtime( true ) - $this->startTime ), 'PEG' );
			}
			return true;
		}
	) { return $t; }
	/ newlineToken* &{
		// "tlb" matches "block" matches "sol" matches "newlineToken"
		// But, "tlb" is prefixed with a !eof clause, so, we should only
		// get here on eof.  So, safe to unconditionally terminate the
		// generator loop here.
		return false;
	}
)*

/*
 * A document (start rule) is a sequence of toplevelblocks. Tokens are
 * emitted in chunks per toplevelblock to avoid buffering the full document.
 */
tlb "tlb"
  = !eof b:block {
	// Clear the tokenizer's backtracking cache after matching each
	// toplevelblock.  There won't be any backtracking as a document is just a
	// sequence of toplevelblocks, so the cache for previous toplevelblocks
	// will never be needed.
	$end = $this->startOffset();
	for ( ; $this->prevOffset < $end; $this->prevOffset++ ) {
		unset( $this->cache[$this->prevOffset] );
	}

	$tokens = null;
	if ( is_array( $b ) && count( $b ) ) {
		$tokens = TokenizerUtils::flattenIfArray( $b );
	} elseif ( is_string( $b ) ) {
		$tokens = [ $b ];
	}

	// Emit tokens for this toplevelblock. This feeds a chunk to the parser pipeline.
	return $this->emitChunk( $tokens );
}

/*
 * The actual contents of each block.
 */
block
	// has to be first alternative; otherwise gets parsed as a <ol>
  = &sof r:redirect cil:comment_include_annotation* bl:block_line? {
		return array_merge( [ $r ], $cil, $bl ?: [] );
	}
	/ block_lines
	/ & '<' rs:( c:comment &eolf { return $c; }
		// avoid a paragraph if we know that the line starts with a block tag
		/ bt:block_tag
	) { return $rs; }
	/ paragraph
	// Inlineline includes generic tags; wrapped into paragraphs in token
	// transform and DOM postprocessor
	/ inlineline
	/ s:sol !sof !inline_breaks { return $s; }

/*
 * A block nested in other constructs. Avoid eating end delimiters for other
 * constructs by checking against inline_breaks first.
 */
nested_block = !inline_breaks b:block { return $b; }

/*
 * The same, but suitable for use inside a table construct.
 * Doesn't match table_heading_tag, table_row_tag, table_data_tag,
 * table_caption tag, or table_end_tag, although it does allow
 * table_start_tag (for nested tables).
 */
nested_block_in_table
  =
	// XXX: don't rely on a lame look-ahead like this; use syntax stops
	// instead, so that multi-line th content followed by a line prefixed with
	// a comment is also handled.  Alternatively, implement a sol look-behind
	// assertion accepting spaces and comments.
	!(sol (space* sol)? space* (pipe / "!"))
	// avoid recursion via nested_block_in_table, as that can lead to stack
	// overflow in large tables
	// See https://phabricator.wikimedia.org/T59670
	b:nested_block<tableDataBlock> {
		return $b;
	}

/*
 * Line-based block constructs.
 */
block_lines
  = s:sol
	// eat an empty line before the block
	s2:(os:optionalSpaceToken so:sol { return array_merge( $os, $so ); })?
	bl:block_line {
		return array_merge( $s, $s2 ?: [], is_array( $bl ) ? $bl : [ $bl ] );
	}

// Horizontal rules
hr = "----" d:$"-"*
	// Check if a newline or content follows
	lineContent:( &sol "" { return null; } / "" { return true; } ) {
		$dataAttribs = new DataParsoid;
		$dataAttribs->tsr = $this->tsrOffsets();
		if ( $lineContent !== null ) {
			$dataAttribs->lineContent = $lineContent;
		}
		if ( strlen( $d ) > 0 ) {
			$dataAttribs->extra_dashes = strlen( $d );
		}
		return new SelfclosingTagTk( 'hr', [], $dataAttribs );
	}

/*
 * Block structures with start-of-line wiki syntax
 */
block_line
  = heading
	/ list_item
	/ hr
	/ st:optionalSpaceToken
		r:( & [ <{}|!] tl:table_line { return $tl; }
			// tag-only lines should not trigger pre either
			/ bts:(bt:block_tag stl:optionalSpaceToken { return array_merge( $bt, $stl ); })+
				&eolf { return $bts; }
		) {
			return array_merge( $st, $r );
		}

/*
 * A paragraph. We don't emit 'p' tokens to avoid issues with template
 * transclusions, <p> tags in the source and the like. Instead, we perform
 * some paragraph wrapping on the token stream and the DOM.
 */
paragraph
  = s1:sol s2:sol c:inlineline {
	return array_merge( $s1, $s2, $c );
}

br = s:optionalSpaceToken &newline {
	$dp = new DataParsoid;
	$dp->tsr = $this->tsrOffsets();
	return array_merge( $s, [ new SelfclosingTagTk( 'br', [], $dp ) ] );
}

inline_breaks = & [=|!{}:;\r\n[\]\-]
	(
		annOrExtTag: <annOrExtTag>
		h: <h>
		extlink: <extlink>
		intemplate: <intemplate>
		preproc: <preproc>
		equal: <equal>
		table: <table>
		templateArg: <templateArg>
		tableCellArg: <tableCellArg>
		semicolon: <semicolon>
		arrow: <arrow>
		linkdesc: <linkdesc>
		colon: <colon>
		th: <th>
		& {
			return TokenizerUtils::inlineBreaks( $this->input, $this->endOffset(), [
				'annOrExtTag' => $annOrExtTag,
				'h' => $h,
				'extlink' => $extlink,
				'intemplate' => $intemplate,
				'preproc' => $preproc,
				'equal' => $equal,
				'table' => $table,
				'templateArg' => $templateArg,
				'tableCellArg' => $tableCellArg,
				'semicolon' => $semicolon,
				'arrow' => $arrow,
				'linkdesc' => $linkdesc,
				'colon' => $colon,
				'th' => $th
			], $this->env );
		}
	)

inlineline
  = c:( urltext
		/ !inline_breaks
			r:(inline_element / !newline s:. { return $s; }) { return $r; }
	)+ {
		return TokenizerUtils::flattenStringlist( $c );
	}

inline_element
  = & '<' r:( xmlish_tag / comment ) { return $r; }
	/ & '{' r:tplarg_or_template { return $r; }
	/ & "-{" r:lang_variant_or_tpl { return $r; }
	// FIXME: The old parser's handleInternalLinks2 splits on [[, resulting
	// in sequences with odd number of brackets parsing as text, and sequences
	// with even number of brackets having its innermost pair parse as a
	// wikilink.  For now, we faithfully reproduce what's found there but
	// wikitext, the language, shouldn't be defined by odd tokenizing behaviour
	// in the old parser.  Flagging this for a future cleanup.
	/ $('[[' &'[')+
	/ & '[' r:( wikilink / extlink ) { return $r; }
	/ & "'" r:quote { return $r; }

/* Headings  */

heading = & "=" // guard, to make sure '='+ will match.
          // XXX: Also check to end to avoid inline parsing?
 r:(
	s:$'='+ // moved in here to make s accessible to inner action
	ce:(
		( ill:inlineline<h>? { return $ill ?: []; } )
		$'='+
	)?
	& { return $ce || strlen( $s ) > 2; }
	endTPos:("" { return $this->endOffset(); })
	spc:( space / comment_include_annotation )*
	&eolf
	{
		$c = null;
		$e = null;
		$level = null;
		if ( $ce ) {
			$c = $ce[0];
			$e = $ce[1];
			$level = min( strlen( $s ), strlen( $e ) );
		} else {
			// split up equal signs into two equal parts, with at least
			// one character in the middle.
			$level = (int)floor( ( strlen( $s ) - 1 ) / 2 );
			$c = [ str_repeat( '=', strlen( $s ) - 2 * $level ) ];
			$s = $e = str_repeat( '=', $level );
		}
		$level = min( 6, $level );
		// convert surplus equals into text
		if ( strlen( $s ) > $level ) {
			$extras1 = substr( $s, 0, strlen( $s ) - $level );
			if ( is_string( $c[0] ) ) {
				$c[0] = $extras1 . $c[0];
			} else {
				array_unshift( $c, $extras1 );
			}
		}
		if ( strlen( $e ) > $level ) {
			$extras2 = substr( $e, 0, strlen( $e ) - $level );
			$lastElem = PHPUtils::lastItem( $c );
			if ( is_string( $lastElem ) ) {
				$c[count( $c ) - 1] .= $extras2;
			} else {
				$c[] = $extras2;
			}
		}

		$tagDP = new DataParsoid;
		$tagDP->tsr = $this->tsrOffsets( 'start' );
		$tagDP->tsr->end += $level;

		// Match the old parser's behavior by (a) making headingIndex part of tokenizer
		// state(don't reuse pipeline!) and (b) assigning the index when
		// ==*== is tokenized, even if we're inside a template argument
		// or other context which won't end up putting the heading
		// on the output page.  T213468/T214538
		$this->headingIndex++;
		$tagDP->getTemp()->headingIndex = $this->headingIndex;

		$res = [ new TagTk( 'h' . $level, [], $tagDP ) ];
		PHPUtils::pushArray( $res, $c );
		$endTagDP = new DataParsoid;
		$endTagDP->tsr = new SourceRange( $endTPos - $level, $endTPos );
		$res[] = new EndTagTk( 'h' . $level, [], $endTagDP );
		$res[] = $spc;
		return $res;
	}
) { return $r; }

/* Comments */

// The old parser does a straight str.replace(/<!--((?!-->).)*-->/g, "")
// but, as always, things around here are a little more complicated.
//
// We accept the same comments, but because we emit them as HTML comments
// instead of deleting them, we have to encode the data to ensure that
// we always emit a valid HTML5 comment.  See the encodeComment helper
// for further details.

comment = '<!--' c:$(!"-->" .)* ('-->' / eof) {
	// WARNING(T279451): This encoding is important for the choice of key
	// in WTUtils::fosterCommentData
	$data = WTUtils::encodeComment( $c );
	$dp = new DataParsoid;
	$dp->tsr = $this->tsrOffsets();
	return [ new CommentTk( $data, $dp ) ];
}

// Behavior switches. See:
// https://www.mediawiki.org/wiki/Help:Magic_words#Behavior_switches
behavior_switch = bs:$('__' behavior_text '__') {
	if ( $this->siteConfig->isMagicWord( $bs ) ) {
		$dp = new DataParsoid;
		$dp->tsr = $this->tsrOffsets();
		$dp->src = $bs;
		$dp->magicSrc = $bs;
		return [ new SelfclosingTagTk( 'behavior-switch', [ new KV( 'word', $bs ) ], $dp ) ];
	} else {
		return [ $bs ];
	}
}

// Instead of defining a charset, the old parser's doDoubleUnderscore concats a
// regexp of all the language specific aliases of the behavior switches and
// then does a match and replace. Just be as permissive as possible and let the
// BehaviorSwitchPreprocessor back out of any overreach.
behavior_text = $( !'__' ( text_char / "-" ) )+

/**************************************************************
 * External (bracketed and autolinked) links
 **************************************************************/

autolink
  = ! <extlink>
	// this must be a word boundary, so previous character must be non-word
	! { return Utils::isUniWord(Utils::lastUniChar( $this->input, $this->endOffset() ) ); }
	r:( autourl / autoref / isbn ) { return $r; }

extlink "extlink" = ! <extlink> // extlink cannot be nested
	r:(
		"["
		p0:( "" { return $this->endOffset(); } )
		addr:( url_protocol ipv6urladdr / "" )
		target:( extlink_nonipv6url<extlink> / "" )
		p1:( "" { return $this->endOffset(); } )
		& {
			// Protocol must be valid and there ought to be at least one
			// post-protocol character.  So strip last char off target
			// before testing protocol.
			$flat = TokenizerUtils::flattenString( [ $addr, $target ] );
			if ( is_array( $flat ) ) {
				// There are templates present, alas.
				return count( $flat ) > 0;
			}
			return Utils::isProtocolValid( substr( $flat, 0, -1 ), $this->env );
		}
		sp:$( space / unispace )*
		p2:( "" { return $this->endOffset(); } )
		content:inlineline<extlink>?
		p3:( "" { return $this->endOffset(); } )
		"]" {
			$tsr1 = new SourceRange( $p0, $p1 );
			$tsr2 = new SourceRange( $p2, $p3 );
			$dp = new DataParsoid;
			$dp->tsr = $this->tsrOffsets();
			$dp->extLinkContentOffsets = $tsr2;
			return [
				new SelfclosingTagTk(
					'extlink',
					[
						new KV( 'href', TokenizerUtils::flattenString( [ $addr, $target ] ), $tsr1->expandTsrV() ),
						new KV( 'mw:content', $content ?? '', $tsr2->expandTsrV() ),
						new KV( 'spaces', $sp )
					],
					$dp
				)
			];
		}
	) { return $r; }

autoref
  = ref:('RFC' / 'PMID') sp:space_or_nbsp+ identifier:$[0-9]+ end_of_word {
	$base_urls = [
		'RFC' => 'https://tools.ietf.org/html/rfc%s',
		'PMID' => '//www.ncbi.nlm.nih.gov/pubmed/%s?dopt=Abstract'
	];
	$tsr = $this->tsrOffsets();
	$dp = new DataParsoid;
	$dp->tsr = $tsr;
	$dp->stx = 'magiclink';
	return [
		new SelfclosingTagTk( 'extlink', [
				new KV( 'href', sprintf( $base_urls[ $ref ], $identifier ) ),
				new KV( 'mw:content', TokenizerUtils::flattenString( [ $ref, $sp, $identifier ] ), $tsr->expandTsrV() ),
				new KV( 'typeof', 'mw:ExtLink/' . $ref )
			],
			$dp
		)
	];
}

isbn
  = 'ISBN' sp:space_or_nbsp+ isbn:(
		[0-9]
		((space_or_nbsp_or_dash / "") [0-9])+
		((space_or_nbsp_or_dash / "") [xX] / "")
	) isbncode:(
		end_of_word {
			// Convert isbn token-and-entity array to stripped string.
			$stripped = '';
			foreach ( TokenizerUtils::flattenStringlist( $isbn ) as $part ) {
				if ( is_string( $part ) ) {
					$stripped .= $part;
				}
			}
			return strtoupper( preg_replace( '/[^\dX]/i', '', $stripped ) );
		}
	) &{
		// ISBNs can only be 10 or 13 digits long (with a specific format)
		return strlen( $isbncode ) === 10
			|| ( strlen( $isbncode ) === 13 && preg_match( '/^97[89]/', $isbncode ) );
	} {
		$tsr = $this->tsrOffsets();
		$dp = new DataParsoid;
		$dp->stx = 'magiclink';
		$dp->tsr = $tsr;
		return [
			new SelfclosingTagTk( 'extlink', [
					new KV( 'href', 'Special:BookSources/' . $isbncode ),
					new KV( 'mw:content', TokenizerUtils::flattenString( [ 'ISBN', $sp, $isbn ] ), $tsr->expandTsrV() ),
					new KV( 'typeof', 'mw:WikiLink/ISBN' )
				],
				$dp
			)
		];
	}

/* Default URL protocols in MediaWiki (see DefaultSettings). Normally
 * these can be configured dynamically. */
url_protocol =
	p:$( '//' / [A-Za-z] [-A-Za-z0-9+.]* ':' '//'? ) & {
		return Utils::isProtocolValid( $p, $this->env );
	} {
		return $p;
	}

// no punctuation, and '{<' to trigger directives
no_punctuation_char = [^ \]\[\r\n"'<>\x00-\x20\x7f&\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000{]

// this is the general url rule
// on the PHP side, the path part matches EXT_LINK_URL_CLASS
// which is '[^][<>"\\x00-\\x20\\x7F\p{Zs}]'
url
  = proto:url_protocol
	addr:(ipv6urladdr / "")
	path:( !inline_breaks
		c:(
			no_punctuation_char
			/ comment
			/ tplarg_or_template
			/ ['{]
			/ ! ( "&" ( [lL][tT] / [gG][tT] ) ";" )
				r:(
					& "&" he:htmlentity { return $he; }
					/ "&"
				) { return $r; }
		) { return $c; }
	)*
	// Must be at least one character after the protocol
	& { return $addr !== '' || count( $path ) > 0; }
{
	return TokenizerUtils::flattenString( array_merge( [ $proto, $addr ], $path ) );
}

// this is the somewhat-restricted rule used in autolinks
// See Parser::doMagicLinks and Parser.php::makeFreeExternalLink.
// The `path` portion matches EXT_LINK_URL_CLASS, as in the general
// url rule.  As in PHP, we do some fancy fixup to yank out
// trailing punctuation, perhaps including parentheses.
autourl
  = ! '//' // protocol-relative autolinks not allowed (T32269)
	r:(
	proto:url_protocol
	addr:(ipv6urladdr / "")
	path:( !inline_breaks
		c:(
			no_punctuation_char
			/ comment
			/ tplarg_or_template
			/ $("'" !"'") // single quotes are ok, double quotes are bad
			/ "{"
			/ ! ( rhe:raw_htmlentity &{ return $rhe === '<' || $rhe === '>' || $rhe === "\u{A0}"; } )
				r:(
					& "&" he:htmlentity { return $he; }
					/ "&"
				) { return $r; }
		) { return $c; }
	)*
	{
		// as in Parser.php::makeFreeExternalLink, we're going to
		// yank trailing punctuation out of this match.
		$url = TokenizerUtils::flattenStringlist( array_merge( [ $proto, $addr ], $path ) );
		// only need to look at last element; HTML entities are strip-proof.
		$last = PHPUtils::lastItem( $url );
		$trim = 0;
		if ( is_string( $last ) ) {
			$strip = TokenizerUtils::getAutoUrlTerminatingChars( in_array( '(', $path, true ) );
			$trim = strspn( strrev( $last ), $strip );
			$url[ count( $url ) - 1 ] = substr( $last, 0, strlen( $last ) - $trim );
		}
		$url = TokenizerUtils::flattenStringlist( $url );
		if ( count( $url ) === 1 && is_string( $url[0] ) && strlen( $url[0] ) <= strlen( $proto ) ) {
			return null; // ensure we haven't stripped everything: T106945
		}
		$this->currPos -= $trim;
		return $url;
	} ) &{ return $r !== null; } {
		$tsr = $this->tsrOffsets();
		$dp = new DataParsoid;
		$dp->tsr = $tsr;
		$res = [ new SelfclosingTagTk( 'urllink', [ new KV( 'href', $r, $tsr->expandTsrV() ) ], $dp ) ];
		return $res;
	}

// This is extracted from EXT_LINK_ADDR in Parser.php: a simplified
// expression to match an IPv6 address.  The IPv4 address and "at least
// one character of a host name" portions are punted to the `path`
// component of the `autourl` and `url` productions
ipv6urladdr = $( "[" [0-9A-Fa-f:.]+ "]" )

/**************************************************************
 * Templates, -arguments and wikilinks
 **************************************************************/

/*
 * Precedence: template arguments win over templates. See
 * http://www.mediawiki.org/wiki/Preprocessor_ABNF#Ideal_precedence
 * 4: {{{{·}}}} → {·{{{·}}}·}
 * 5: {{{{{·}}}}} → {{·{{{·}}}·}}
 * 6: {{{{{{·}}}}}} → {{{·{{{·}}}·}}}
 * 7: {{{{{{{·}}}}}}} → {·{{{·{{{·}}}·}}}·}
 * This is only if close has > 3 braces; otherwise we just match open
 * and close as we find them.
 */
tplarg_or_template
  = &'{{' t:tplarg_or_template_guarded<intemplate=true> { return $t; }

tplarg_or_template_guarded
  = &('{{' &('{{{'+ !'{') tplarg) a:(template/broken_template) { return $a; }
	/ a:$('{' &('{{{'+ !'{'))? b:tplarg { return [ $a, $b ]; }
	/ a:$('{' &('{{' !'{'))? b:template { return [ $a, $b ]; }
	/ broken_template

tplarg_or_template_or_bust
  = r:(tplarg_or_template / .)+ { return TokenizerUtils::flattenIfArray( $r ); }

template
  = template_preproc<&preproc="}}">

// The old preprocessor maintains a single stack of "closing token we
// are currently looking for", with no backtracking.  This means that
// once you see `[[ {{` you are looking only for `}}` -- if that template
// turns out to be broken you will never pop the `}}` and there is no way
// to close the `[[`.  Since the PEG tokenizer in Parsoid uses backtracking
// and parses in a single pass (instead of PHP's split preprocessor/parser)
// we have to be a little more careful when we emulate this behavior.
// If we use a rule like:
//   template = "{{" tplname tplargs* "}}"?
// Then we end up having to reinterpret `tplname tplargs*` as a tlb if it
// turns out we never find the `}}`, which involves a lot of tedious gluing
// tokens back together with fingers crossed we haven't discarded any
// significant newlines/whitespace/etc.  An alternative would be a rule like:
//   broken_template = "{{" tlb
// but again, `template` is used in many different contexts; `tlb` isn't
// necessarily the right one to recursively invoke.  Instead we get the
// broken template off of the PEGjs production stack by returning immediately
// after `{{`, but we set the "preproc" reference parameter to false (the
// reference parameter feature having been introduced for this sole purpose)
// to indicate to the parent rule that we're "still in" the {{ context and
// shouldn't ever inlineBreak for any closing tokens above this one.  For
// example:
//   [[Foo{{Bar]]
// This will match as:
//   wikilink->text,template->text             --> FAILS looking for }}
//     backtracks, popping "bracket_bracket" and "brace_brace" off preproc stack
//   wikilink->text,broken_template,text       --> FAILS looking for ]]
//     backtracks, popping "bracket_bracket" and false off preproc stack
//   broken_wikilink,text,broken_template,text --> OK
//     with [false, false] left on the preproc stack
broken_template
  = preproc:<&preproc> t:"{{" {
	$preproc = null;
	return $t;
}

template_preproc
  = "{{" leadWS:$( nl_comment_space* )
	target:template_param_value
	params:(
		nl_comment_space* "|"
		r:(
			p0:("" { return $this->endOffset(); })
			v:nl_comment_space*
			p:("" { return $this->endOffset(); })
			&("|" / "}}")
			{
				// empty argument
				$tsr0 = new SourceRange( $p0, $p );
				return new KV( '', TokenizerUtils::flattenIfArray( $v ), $tsr0->expandTsrV() );
			}
			/ template_param
		) { return $r; }
	)*
	trailWS:$( nl_comment_space* )
	inline_breaks "}}" {
		// Insert target as first positional attribute, so that it can be
		// generically expanded. The TemplateHandler then needs to shift it out
		// again.
		array_unshift( $params, new KV( TokenizerUtils::flattenIfArray( $target['tokens'] ), '', $target['srcOffsets']->expandTsrK() ) );
		$dp = new DataParsoid;
		$dp->tsr = $this->tsrOffsets();
		$dp->src = $this->text();
		$tmp = $dp->getTemp();
		$tmp->leadWS = $leadWS;
		$tmp->trailWS = $trailWS;
		$obj = new SelfclosingTagTk( 'template', $params, $dp );
		return $obj;
	}
	/ $('{{' space_or_newline* '}}')

tplarg
  = tplarg_preproc<&preproc="}}">

tplarg_preproc
  = "{{{"
	p:("" { return $this->endOffset(); })
	target:template_param_value?
	params:(
		nl_comment_space* "|"
		r:(
			p0:("" { return $this->endOffset(); })
			v:nl_comment_space*
			p1:("" { return $this->endOffset(); })
			&("|" / "}}}")
			{
				// empty argument
				return [ 'tokens' => $v, 'srcOffsets' => new SourceRange( $p0, $p1 ) ];
			}
			/ template_param_value
		) { return $r; }
	)*
	nl_comment_space*
	inline_breaks "}}}" {
		$kvs = [];

		if ( $target === null ) {
			$target = [ 'tokens' => '', 'srcOffsets' => new SourceRange( $p, $p ) ];
		}
		// Insert target as first positional attribute, so that it can be
		// generically expanded. The TemplateHandler then needs to shift it out
		// again.
		$kvs[] = new KV( TokenizerUtils::flattenIfArray( $target['tokens'] ), '', $target['srcOffsets']->expandTsrK() );

		foreach ( $params as $o ) {
			$s = $o['srcOffsets'];
			$kvs[] = new KV( '', TokenizerUtils::flattenIfArray( $o['tokens'] ), $s->expandTsrV() );
		}

		$dp = new DataParsoid;
		$dp->tsr = $this->tsrOffsets();
		$dp->src = $this->text();
		$obj = new SelfclosingTagTk( 'templatearg', $kvs, $dp );
		return $obj;
	}

template_param
  = name:template_param_name
	val:(
		kEndPos:("" { return $this->endOffset(); })
		// no optionalSpaceToken here, it's eaten by template_param_name
		"="
		vStartPos:("" { return $this->endOffset(); })
		optSp:optionalSpaceToken
		tpv:template_param_value? {
			return [
				'kEndPos' => $kEndPos,
				'vStartPos' => $vStartPos,
				'value' => TokenizerUtils::flattenString( [ $optSp, $tpv['tokens'] ?? [] ] ),
			];
		}
	)?
{
    if ( $val !== null ) {
        // Named parameter (an "=" was seen). The KV source range covers
        // the whole `name=value` span: the key part ends at kEndPos and
        // the value part starts at vStartPos.
        // FIX: previously this range was only built in the non-null-value
        // branch, so the `$val['value'] === null` branch below returned a
        // KV with an undefined `$so` (PHP undefined-variable notice and a
        // null source range on the token). Hoisting the construction makes
        // both branches share the same, well-defined range.
        $so = new KVSourceRange(
            $this->startOffset(), $val['kEndPos'],
            $val['vStartPos'], $this->endOffset()
        );
        if ( $val['value'] !== null ) {
            return new KV(
                $name,
                TokenizerUtils::flattenIfArray( $val['value'] ),
                $so
            );
        } else {
            // "name=" with no value: emit an empty-valued KV.
            return new KV(
                TokenizerUtils::flattenIfArray( $name ),
                '',
                $so
            );
        }
    } else {
        // No "=" seen: positional parameter. The whole match is the value
        // and the key is empty.
        $so = new SourceRange( $this->startOffset(), $this->endOffset() );
        return new KV(
            '',
            TokenizerUtils::flattenIfArray( $name ),
            $so->expandTsrV()
        );
    }
}
// empty parameter
/ & [|}] {
    $so = new SourceRange( $this->startOffset(), $this->endOffset() );
    return new KV( '', '', $so->expandTsrV() );
}

template_param_name
  = template_param_text<equal>
  / (&'=' { return ''; })

template_param_value
  = tpt:template_param_text<equal=false>
{
    return [ 'tokens' => $tpt, 'srcOffsets' => $this->tsrOffsets() ];
}

template_param_text
  = il:(nested_block<table=false, extlink=false, templateArg=true, tableCellArg=false> / newlineToken)+
{
    // il is guaranteed to be an array -- so, tu.flattenIfArray will
    // always return an array
    $r = TokenizerUtils::flattenIfArray( $il );
    if ( count( $r ) === 1 && is_string( $r[0] ) ) {
        $r = $r[0];
    }
    return $r;
}

//// Language converter block markup of language variants: -{ ... }-
// Note that "rightmost opening" precedence rule (see
// https://www.mediawiki.org/wiki/Preprocessor_ABNF ) means
// that neither -{{ nor -{{{ are parsed as a -{ token, although
// -{{{{ is (since {{{ has precedence over {{).
lang_variant_or_tpl = &('-{' &('{{{'+ !'{') tplarg) a:lang_variant { return $a; } / a:$('-' &('{{{'+ !'{')) b:tplarg { return [ $a, $b ]; } / a:$('-' &('{{' '{{{'* !'{')) b:template { return [ $a, $b ]; } / &'-{' a:lang_variant { return $a; } broken_lang_variant = r:"-{" preproc:<&preproc> { $preproc = null; return $r; } lang_variant = // FIXME: Maybe this should suppress "table" and "tableCellArg" like 'template_param_text' d too lang_variant_preproc<&preproc="}-", extlink=false> / broken_lang_variant lang_variant_preproc = lv0:("-{" { return $this->startOffset(); }) f:( &{ return $this->env->langConverterEnabled(); } ff:opt_lang_variant_flags { // if flags contains 'R', then don't treat ; or : specially inside. if ( isset( $ff['flags'] ) ) { $ff['raw'] = isset( $ff['flags']['R'] ) || isset( $ff['flags']['N'] ); } elseif ( isset( $ff['variants'] ) ) { $ff['raw'] = true; } return $ff; } / &{ return !$this->env->langConverterEnabled(); } "" { // if language converter not enabled, don't try to parse inside. return [ 'raw' => true ]; } ) ts:( &{ return $f['raw']; } lv:lang_variant_text { return [ [ 'text' => $lv ] ]; } / &{ return !$f['raw']; } lv:lang_variant_option_list { return $lv; } ) inline_breaks lv1:("}-" { return $this->endOffset(); }) { if ( !$this->env->langConverterEnabled() ) { return [ '-{', $ts[0]['text']['tokens'], '}-' ]; } $lvsrc = substr( $this->input, $lv0, $lv1 - $lv0 ); $attribs = []; foreach ( $ts as &$t ) { // move token strings into KV attributes so that they are // properly expanded by early stages of the token pipeline foreach ( [ 'text', 'from', 'to' ] as $fld ) { if ( !isset( $t[$fld] ) ) { continue; } $name = 'mw:lv' . count( $attribs ); // Note that AttributeExpander will expect the tokens array to be // flattened. We do that in lang_variant_text / lang_variant_nowiki $attribs[] = new KV( $name, $t[$fld]['tokens'], $t[$fld]['srcOffsets']->expandTsrV() ); $t[$fld] = $name; } } unset( $t ); $flags = isset( $f['flags'] ) ? 
array_keys( $f['flags'] ) : []; sort( $flags ); $variants = isset( $f['variants'] ) ? array_keys( $f['variants'] ) : []; sort( $variants ); $dp = new DataParsoid; $dp->tsr = new SourceRange( $lv0, $lv1 ); $dp->src = $lvsrc; $dp->flags = $flags; $dp->variants = $variants; $dp->original = $f['original']; $dp->flagSp = $f['sp']; $dp->texts = $ts; return [ new SelfclosingTagTk( 'language-variant', $attribs, $dp ) ]; } opt_lang_variant_flags = f:( ff:lang_variant_flags "|" { return $ff; } )? { // Collect & separate flags and variants into a hashtable (by key) and ordered list $flags = []; $variants = []; $flagList = []; $flagSpace = []; $variantList = []; $variantSpace = []; $useVariants = false; if ( $f !== null ) { // lang_variant_flags returns arrays in reverse order. $spPtr = count( $f['sp'] ) - 1; for ( $i = count( $f['flags'] ) - 1; $i >= 0; $i--) { $item = $f['flags'][$i]; if ( isset( $item['flag'] ) ) { $flagSpace[] = $f['sp'][$spPtr--]; $flags[$item['flag']] = true; $flagList[] = $item['flag']; $flagSpace[] = $f['sp'][$spPtr--]; } if ( isset( $item['variant'] ) ) { $variantSpace[] = $f['sp'][$spPtr--]; $variants[$item['variant']] = true; $variantList[] = $item['variant']; $variantSpace[] = $f['sp'][$spPtr--]; } } if ( $spPtr >= 0 ) { // handle space after a trailing semicolon $flagSpace[] = $f['sp'][$spPtr]; $variantSpace[] = $f['sp'][$spPtr]; } } // Parse flags (this logic is from core/languages/ConverterRule.php // in the parseFlags() function) if ( count( $flags ) === 0 && count( $variants ) === 0 ) { $flags['$S'] = true; } elseif ( isset( $flags['R'] ) ) { $flags = [ 'R' => true ]; // remove other flags } elseif ( isset( $flags['N'] ) ) { $flags = [ 'N' => true ]; // remove other flags } elseif ( isset( $flags['-'] ) ) { $flags = [ '-' => true ]; // remove other flags } elseif ( isset( $flags['T'] ) && count( $flags ) === 1 ) { $flags['H'] = true; } elseif ( isset( $flags['H'] ) ) { // Replace A flag, and remove other flags except T and D $nf = [ '$+' => 
true, 'H' => true ]; if ( isset( $flags['T'] ) ) { $nf['T'] = true; } if ( isset( $flags['D'] ) ) { $nf['D'] = true; } $flags = $nf; } elseif ( count( $variants ) > 0 ) { $useVariants = true; } else { if ( isset( $flags['A'] ) ) { $flags['$+'] = true; $flags['$S'] = true; } if ( isset( $flags['D'] ) ) { unset( $flags['$S'] ); } } if ( $useVariants ) { return [ 'variants' => $variants, 'original' => $variantList, 'sp' => $variantSpace ]; } else { return [ 'flags' => $flags, 'original' => $flagList, 'sp' => $flagSpace ]; } } lang_variant_flags = sp1:$(space_or_newline*) f:lang_variant_flag sp2:$(space_or_newline*) more:( ";" lang_variant_flags? )? { $r = ( $more && $more[1] ) ? $more[1] : [ 'sp' => [], 'flags' => [] ]; // Note that sp and flags are in reverse order, since we're using // right recursion and want to push instead of unshift. $r['sp'][] = $sp2; $r['sp'][] = $sp1; $r['flags'][] = $f; return $r; } / sp:$(space_or_newline*) { return [ 'sp' => [ $sp ], 'flags' => [] ]; } lang_variant_flag = f:[-+A-Z] { return [ 'flag' => $f ]; } / v:lang_variant_name { return [ 'variant' => $v ]; } / b:$(!space_or_newline !nowiki [^{}|;])+ { return [ 'bogus' => $b ]; /* bad flag */} // language variant name, like zh, zh-cn, etc. lang_variant_name = $([a-z] [-a-zA-Z]+) // Escaped otherwise-unrepresentable language names // Primarily for supporting html2html round trips; PHP doesn't support // using nowikis here (yet!) 
/ nowiki_text lang_variant_option_list = o:lang_variant_option rest:( ";" oo:lang_variant_option { return $oo; })* tr:( ";" $bogus_lang_variant_option )* // optional trailing crap { array_unshift( $rest, $o ); // if the last bogus option is just spaces, keep them; otherwise // drop all this bogus stuff on the ground if ( count($tr) > 0 ) { $last = $tr[count($tr)-1]; if (preg_match('/^\s*$/Du', $last[1])) { $rest[] = [ 'semi' => true, 'sp' => $last[1] ]; } } return $rest; } / lvtext:lang_variant_text { return [ [ 'text' => $lvtext ] ]; } bogus_lang_variant_option = lang_variant_text? lang_variant_option = sp1:$(space_or_newline*) lang:lang_variant_name sp2:$(space_or_newline*) ":" sp3:$(space_or_newline*) lvtext:(lang_variant_nowiki / lang_variant_text_no_semi) { return [ 'twoway' => true, 'lang' => $lang, 'text' => $lvtext, 'sp' => [ $sp1, $sp2, $sp3 ] ]; } / sp1:$(space_or_newline*) from:(lang_variant_nowiki / lang_variant_text_no_semi_or_arrow) "=>" sp2:$(space_or_newline*) lang:lang_variant_name sp3:$(space_or_newline*) ":" sp4:$(space_or_newline*) to:(lang_variant_nowiki / lang_variant_text_no_semi) { return [ 'oneway' => true, 'from' => $from, 'lang' => $lang, 'to' => $to, 'sp' => [ $sp1, $sp2, $sp3, $sp4 ] ]; } // html2wt support: If a language name or conversion string can't be // represented w/o breaking wikitext, just wrap it in a <nowiki>. // PHP doesn't support this (yet), but Parsoid does. lang_variant_nowiki = n:nowiki_text sp:$space_or_newline* { $tsr = $this->tsrOffsets(); $tsr->end -= strlen( $sp ); return [ 'tokens' => [ $n ], 'srcOffsets' => $tsr, ]; } lang_variant_text = tokens:(inlineline / "|" )* { return [ 'tokens' => TokenizerUtils::flattenStringlist( $tokens ), 'srcOffsets' => $this->tsrOffsets(), ]; } lang_variant_text_no_semi = lang_variant_text<semicolon> lang_variant_text_no_semi_or_arrow = lang_variant_text_no_semi<arrow> wikilink_content = ( pipe startPos:("" { return $this->endOffset(); }) lt:link_text? 
{ $tsr = new SourceRange( $startPos, $this->endOffset() ); $maybeContent = new KV( 'mw:maybeContent', $lt ?? [], $tsr->expandTsrV() ); $maybeContent->vsrc = substr( $this->input, $startPos, $this->endOffset() - $startPos ); return $maybeContent; } )* wikilink = wikilink_preproc<&preproc="]]"> / broken_wikilink // `broken-link` (see [[:mw:Preprocessor_ABNF]]), but careful because the // second bracket could start an extlink. Set preproc to false as a reference // parameter in the parent since we haven't seen a double-close bracket. // (See full explanation above broken_template production.) broken_wikilink = &"[[" preproc:<&preproc> &{ $preproc = null; return true; } a:("[" (extlink / "[")) { return $a; } wikilink_preproc = "[[" spos:("" { return $this->endOffset(); }) target:wikilink_preprocessor_text? tpos:("" { return $this->endOffset(); }) lcs:wikilink_content inline_breaks "]]" { $pipeTrick = count( $lcs ) === 1 && count( $lcs[0]->v ) === 0; $textTokens = []; if ( $target === null || $pipeTrick ) { $textTokens[] = '[['; if ( $target ) { $textTokens[] = $target; } foreach ( $lcs as $a ) { // a is a mw:maybeContent attribute $textTokens[] = '|'; if ( count( $a->v ) > 0 ) { $textTokens[] = $a->v; } } $textTokens[] = ']]'; return $textTokens; } $obj = new SelfclosingTagTk( 'wikilink' ); $tsr = new SourceRange( $spos, $tpos ); $hrefKV = new KV( 'href', $target, $tsr->expandTsrV() ); $hrefKV->vsrc = $tsr->substr( $this->input ); // XXX: Point to object with path, revision and input information // obj.source = input; $obj->attribs[] = $hrefKV; $obj->attribs = array_merge( $obj->attribs, $lcs ); $dp = new DataParsoid; $dp->tsr = $this->tsrOffsets(); $dp->src = $this->text(); $obj->dataAttribs = $dp; return [ $obj ]; } // Tables are allowed inside image captions. // Suppress the equal flag temporarily in this rule to consume the '=' here. 
link_text = link_text_parameterized<equal = false, linkdesc = true> link_text_parameterized = c:( // This group is similar to "block_line" but "list_item" // is omitted since `doBlockLevels` happens after // `handleInternalLinks2`, where newlines are stripped. (sol (heading / hr / full_table_in_link_caption)) / urltext / ( !inline_breaks r:( inline_element / '[' text_char+ ']' $(&(!']' / ']]')) / . ) { return $r; } ) )+ { return TokenizerUtils::flattenStringlist( $c ); } /* Generic quote rule for italic and bold, further processed in a token * stream transformation in doQuotes. Relies on NlTk tokens being emitted * for each line of text to balance quotes per line. * * We are not using a simple pair rule here as we need to support mis-nested * bolds/italics and MediaWiki's special heuristics for apostrophes, which are * all not context free. */ quote = quotes:$("''" "'"*) { // sequences of four or more than five quotes are assumed to start // with some number of plain-text apostrophes. $plainticks = 0; $result = []; if ( strlen( $quotes ) === 4 ) { $plainticks = 1; } elseif ( strlen( $quotes ) > 5 ) { $plainticks = strlen( $quotes ) - 5; } if ( $plainticks > 0 ) { $result[] = substr( $quotes, 0, $plainticks ); } // mw-quote token will be consumed in token transforms $tsr = $this->tsrOffsets(); $tsr->start += $plainticks; $dp = new DataParsoid; $dp->tsr = $tsr; $mwq = new SelfclosingTagTk( 'mw-quote', [ new KV( 'value', substr( $quotes, $plainticks ) ) ], $dp ); if ( strlen( $quotes ) > 2 ) { $mwq->addAttribute( 'isSpace_1', $tsr->start > 0 && substr( $this->input, $tsr->start - 1, 1 ) === ' '); $mwq->addAttribute( 'isSpace_2', $tsr->start > 1 && substr( $this->input, $tsr->start - 2, 1 ) === ' '); } $result[] = $mwq; return $result; } /*********************************************************** * Pre and xmlish tags ***********************************************************/ // FIXME: Temporary (?) 
hack to let us not horribly break on old tvar syntax // In coordination with language team, get rid of this hack once all old uses // are migrated to new syntax (T274881). tvar_old_syntax_closing_HACK = "/>" & { return $this->env->hasAnnotations && $this->siteConfig->isAnnotationTag( 'tvar' ); } { $metaAttrs = [ new KV( 'typeof', 'mw:Annotation/tvar/End' ) ]; $dp = new DataParsoid(); $dp->tsr = $this->tsrOffsets(); $dp->tsr->start--; // For "<" matched at the start of xmlish_tag rule return [ new SelfclosingTagTk ( 'meta', $metaAttrs, $dp ) ]; } annotation_tag = annToken:extension_annotation_tag &{ return ( $annToken instanceof Token && $annToken->getName() !== 'extension' ); } { return $annToken; } extension_annotation_tag = !<annOrExtTag> extToken:xmlish_tag // Account for `maybeAnnotationOrExtensionTag` returning unmatched start / end tags &{ return !$extToken || $extToken[0]->getName() === 'extension' || ($extToken[0]->getName() === 'meta' && preg_match( WTUtils::ANNOTATION_META_TYPE_REGEXP, $extToken[0]->getAttribute( 'typeof' ) ) > 0); } { return !$extToken ? '' : $extToken[0]; } nowiki = extToken:extension_annotation_tag &{ return $extToken->getAttribute( 'name' ) === 'nowiki'; } { return $extToken; } // Used by lang_variant productions to protect special language names or // conversion strings. nowiki_text = extToken:nowiki { $txt = Utils::extractExtBody( $extToken ); return Utils::decodeWtEntities( $txt ); } /* Generic XML-like tags * * These also cover extensions (including Cite), which will hook into the * token stream for further processing. The content of extension tags is * parsed as regular inline, but the source positions of the tag are added * to allow reconstructing the unparsed text from the input. */ // See http://www.w3.org/TR/html5/syntax.html#tag-open-state and the following // paragraphs. Note that we don't enforce ascii alpha for the first character // here because we need to be more permissive for extension tag names. 
That // happens in xmlish_tag_opened below. tag_name = $[^\t\n\v />\0]+ // This rule is used in carefully crafted places of xmlish tag tokenizing with // the inclusion of solidus to match where the spec would ignore those // characters. In particular, it does not belong in between attribute name // and value. space_or_newline_or_solidus = space_or_newline / (s:"/" !">" { return $s; }) xmlish_tag = "<" tag:(xmlish_tag_opened<isBlock=false, annOrExtTag> / xmlish_tag_opened<isBlock=false, annOrExtTag=false> / tvar_old_syntax_closing_HACK ) { return $tag; } xmlish_tag_opened = end:"/"? name: tag_name annOrExtTag: <annOrExtTag> isBlock: <isBlock> & { if ( $annOrExtTag ) { return WTUtils::isAnnOrExtTag( $this->env, $name ); } else { // Only enforce ascii alpha first char for non-extension tags. // See tag_name above for the details. return preg_match( '/^[A-Za-z]/', $name ) && $this->isXMLTag( $name, $isBlock ); } } // By the time we get to `doTableStuff` in the old parser, we've already // safely encoded element attributes. See 55313f4e in core. attribs:generic_newline_attributes<table=false, tableCellArg=false> space_or_newline_or_solidus* // No need to preserve this -- canonicalize on RT via dirty diff selfclose:"/"? space* // not preserved - canonicalized on RT via dirty diff ">" { $lcName = mb_strtolower( $name ); // Extension tags don't necessarily have the same semantics as html tags, // so don't treat them as void elements. 
$isVoidElt = Utils::isVoidElement( $lcName ) && !$annOrExtTag; // Support </br> if ( $lcName === 'br' && $end ) { $end = null; } $tsr = $this->tsrOffsets(); $tsr->start--; // For "<" matched at the start of xmlish_tag rule $res = TokenizerUtils::buildXMLTag( $name, $lcName, $attribs, $end, !!$selfclose || $isVoidElt, $tsr ); // change up data-attribs in one scenario // void-elts that aren't self-closed ==> useful for accurate RT-ing if ( !$selfclose && $isVoidElt ) { unset( $res->dataAttribs->selfClose ); $res->dataAttribs->noClose = true; } $met = $this->maybeAnnotationOrExtensionTag( $res, $end, $attribs, $tsr ); return is_array( $met ) ? $met : [ $met ]; } /* * A variant of xmlish_tag, but also checks if the tag name is a block-level * tag as defined in * http://www.w3.org/TR/html5/syntax.html#tag-open-state and * following paragraphs. */ block_tag = "<" tag:(xmlish_tag_opened<isBlock, annOrExtTag> / xmlish_tag_opened<isBlock, annOrExtTag=false>) { return $tag; } // A generic attribute that can span multiple lines. generic_newline_attribute = space_or_newline_or_solidus* namePos0:("" { return $this->endOffset(); }) name:generic_attribute_name namePos:("" { return $this->endOffset(); }) vd:(space_or_newline* "=" v:generic_att_value? { return $v; })? { // NB: Keep in sync w/ table_attibute $res = null; // Encapsulate protected attributes. if ( is_string( $name ) ) { $name = TokenizerUtils::protectAttrs( $name ); } $nameSO = new SourceRange( $namePos0, $namePos ); if ( $vd !== null ) { $res = new KV( $name, $vd['value'], $nameSO->join( $vd['srcOffsets'] ) ); $res->vsrc = $vd['srcOffsets']->substr( $this->input ); } else { $res = new KV( $name, '', $nameSO->expandTsrK() ); } if ( is_array( $name ) ) { $res->ksrc = $nameSO->substr( $this->input ); } return $res; } // A single-line attribute. 
table_attribute = s:optionalSpaceToken namePos0:("" { return $this->endOffset(); }) name:table_attribute_name namePos:("" { return $this->endOffset(); }) vd:(optionalSpaceToken "=" v:table_att_value? { return $v; })? { // NB: Keep in sync w/ generic_newline_attribute $res = null; // Encapsulate protected attributes. if ( gettype( $name ) === 'string' ) { $name = TokenizerUtils::protectAttrs( $name ); } $nameSO = new SourceRange( $namePos0, $namePos ); if ( $vd !== null ) { $res = new KV( $name, $vd['value'], $nameSO->join( $vd['srcOffsets'] ) ); $res->vsrc = $vd['srcOffsets']->substr( $this->input ); } else { $res = new KV( $name, '', $nameSO->expandTsrK() ); } if ( is_array( $name ) ) { $res->ksrc = $nameSO->substr( $this->input ); } return $res; } // The old parser's Sanitizer::removeHTMLtags explodes on < so that it can't // be found anywhere in xmlish tags. This is a divergence from html5 tokenizing // which happily permits it in attribute positions. Extension tags being the // exception, since they're stripped beforehand. less_than = $( &<annOrExtTag> "<" ) // The arrangement of chars is to emphasize the split between what's disallowed // by html5 and what's necessary to give directive a chance. // See: http://www.w3.org/TR/html5/syntax.html#attributes-0 generic_attribute_name = q:$(["'=]?) // From #before-attribute-name-state, < is omitted for directive r:( $[^ \t\r\n\0/=><&{}\-!|]+ / !inline_breaks // \0/=> is the html5 attribute name set we do not want. t:( directive / less_than / $( !( space_or_newline / [\0/=><] ) . ) ) { return $t; } )* & { return count( $r ) > 0 || $q !== ''; } { array_unshift( $r, $q ); return TokenizerUtils::flattenString( $r ); } // Also accept these chars in a wikitext table or tr attribute name position. // They are normally not matched by the table_attribute_name. broken_table_attribute_name_char = c:[\0/=>] { return new KV( $c, '' ); } // Same as generic_attribute_name, except for accepting tags and wikilinks. 
// (That doesn't make sense (ie. match the old parser) in the generic case.) // We also give a chance to break on \[ (see T2553). table_attribute_name = q:$(["'=]?) // From #before-attribute-name-state, < is omitted for directive r:( $[^ \t\r\n\0/=><&{}\-!|\[]+ / !inline_breaks // \0/=> is the html5 attribute name set we do not want. t:( $wikilink / directive // Accept tags-inside-attributes as attribute names. // The sanitizer will strip and shadow them for roundtripping. // Example: <hiddentext>generated with.. </hiddentext> / &xmlish_tag ill:inlineline { return $ill; } / $( !( space_or_newline / [\0/=>] ) . ) ) { return $t; } )* & { return count( $r ) > 0 || $q !== ''; } { array_unshift( $r, $q ); return TokenizerUtils::flattenString( $r ); } // Attribute value, quoted variants can span multiple lines. // Missing end quote: accept /> look-ahead as heuristic. // These need to be kept in sync with the attribute_preprocessor_text_* generic_att_value = s:$(space_or_newline* "'") t:attribute_preprocessor_text_single? q:$("'" / &('/'? '>')) { return TokenizerUtils::getAttrVal( $t, $this->startOffset() + strlen( $s ), $this->endOffset() - strlen( $q ) ); } / s:$(space_or_newline* '"') t:attribute_preprocessor_text_double? q:$('"' / &('/'? '>')) { return TokenizerUtils::getAttrVal( $t, $this->startOffset() + strlen( $s ), $this->endOffset() - strlen( $q ) ); } / s:$space_or_newline* t:attribute_preprocessor_text &(space_or_newline / eof / '/'? '>') { return TokenizerUtils::getAttrVal( $t, $this->startOffset() + strlen( $s ), $this->endOffset() ); } // Attribute value, restricted to a single line. // Missing end quote: accept |, !!, \r, and \n look-ahead as heuristic. // These need to be kept in sync with the table_attribute_preprocessor_text_* table_att_value = s:$(space* "'") t:table_attribute_preprocessor_text_single? q:$("'" / &('!!' 
/ [|\r\n])) { return TokenizerUtils::getAttrVal( $t, $this->startOffset() + strlen( $s ), $this->endOffset() - strlen( $q ) ); } / s:$(space* '"') t:table_attribute_preprocessor_text_double? q:$('"' / &('!!' / [|\r\n])) { return TokenizerUtils::getAttrVal( $t, $this->startOffset() + strlen( $s ), $this->endOffset() - strlen( $q ) ); } / s:$space* t:table_attribute_preprocessor_text &(space_or_newline/ eof / '!!' / '|') { return TokenizerUtils::getAttrVal( $t, $this->startOffset() + strlen( $s ), $this->endOffset() ); } /********************************************************* * Lists *********************************************************/ list_item = dtdd / hacky_dl_uses / li li = bullets:list_char+ c:inlineline? // The inline_break is to check if we've hit a template end delimiter. &(eolf / inline_breaks) { // Leave bullets as an array -- list handler expects this $tsr = $this->tsrOffsets( 'start' ); $tsr->end += count( $bullets ); $dp = new DataParsoid; $dp->tsr = $tsr; $li = new TagTk( 'listItem', [ new KV( 'bullets', $bullets, $tsr->expandTsrV() ) ], $dp ); return array_merge( [ $li ], $c ?: [] ); } /* * This rule is required to support wikitext of this form * ::{|border="1"|foo|bar|baz|} * where the leading colons are used to indent the entire table. * This hack was added back in 2006 in commit * a0746946312b0f1eda30a2c793f5f7052e8e5f3a based on a patch by Carl * Fürstenberg. */ hacky_dl_uses = bullets:":"+ tbl:(table_line (sol+ table_line)*) line:inlineline? &comment_space_eolf { // Leave bullets as an array -- list handler expects this $tsr = $this->tsrOffsets( 'start' ); $tsr->end += count( $bullets ); $dp = new DataParsoid; $dp->tsr = $tsr; $li = new TagTk( 'listItem', [ new KV( 'bullets', $bullets, $tsr->expandTsrV() ) ], $dp ); return TokenizerUtils::flattenIfArray( [ $li, $tbl, $line ?: [] ] ); } dtdd = bullets:(!(";" !list_char) lc:list_char { return $lc; })* ";" c:inlineline_break_on_colon? cpos:(":" { return $this->endOffset(); }) d:inlineline? 
&eolf { // Leave bullets as an array -- list handler expects this // TSR: +1 for the leading ";" $numBullets = count( $bullets ) + 1; $tsr = $this->tsrOffsets( 'start' ); $tsr->end += $numBullets; $li1Bullets = $bullets; $li1Bullets[] = ';'; $dp = new DataParsoid; $dp->tsr = $tsr; $li1 = new TagTk( 'listItem', [ new KV( 'bullets', $li1Bullets, $tsr->expandTsrV() ) ], $dp ); // TSR: -1 for the intermediate ":" $li2Bullets = $bullets; $li2Bullets[] = ':'; $tsr2 = new SourceRange( $cpos - 1, $cpos ); $dp2 = new DataParsoid; $dp2->tsr = $tsr2; $dp2->stx = 'row'; $li2 = new TagTk( 'listItem', [ new KV( 'bullets', $li2Bullets, $tsr2->expandTsrV() ) ], $dp2 ); return array_merge( [ $li1 ], $c ?: [], [ $li2 ], $d ?: [] ); } list_char = [*#:;] inlineline_break_on_colon = inlineline<colon> /****************************************************************************** * Tables * ------ * Table rules are geared to support independent parsing of fragments in * templates (the common table start / row / table end use case). The tokens * produced by these fragments then match up to a table while building the * DOM tree. For similar reasons, table rows do not emit explicit end tag * tokens. * * The separate table_line rule is faster than moving those rules * directly to block_lines. * * Notes about the full_table_in_link_caption rule * ----------------------------------------------------- * However, for link-tables, we have introduced a stricter parse wherein * we require table-start and table-end tags to not come from a template. * In addition, this new rule doesn't accept fosterable-content in * the table unlike the more lax (sol table_line)+ rule. * * This is the best we can do at this time since we cannot distinguish * between table rows and image options entirely in the tokenizer. 
*
 * NOTE(review): this chunk was recovered from a whitespace-mangled dump;
 * the line breaks below were reconstructed from the grammar syntax
 * (tokens are unchanged) — verify formatting against the upstream file.
 *
 * Consider the following examples:
 *
 * Example 1:
 *
 * [[Image:Foo.jpg|left|30px|Example 1
 * {{This-template-returns-a-table-start-tag}}
 * |foo
 * {{This-template-returns-a-table-end-tag}}
 * ]]
 *
 * Example 2:
 *
 * [[Image:Foo.jpg|left|30px|Example 1
 * {{1x|a}}
 * |foo
 * {{1x|b}}
 * ]]
 *
 * So, we cannot know a priori (without preprocessing or fully expanding
 * all templates) if "|foo" in the two examples is a table cell or an image
 * option. This is a limitation of our tokenizer-based approach compared to
 * the preprocessing-based approach of the old parser.
 *
 * Given this limitation, we are okay forcing a full-table context in
 * link captions (if necessary, we can relax the fosterable-content requirement
 * but that is broken wikitext anyway, so we can force that edge-case wikitext
 * to get fixed by rejecting it).
 ******************************************************************************/
full_table_in_link_caption
  = !inline_breaks
    // Note that "linkdesc" is suppressed here to provide a nested parsing
    // context in which to parse the table. Otherwise, we may break on
    // on pipes in the `table_start_tag` and `table_row_tag` attributes.
    // However, as a result, this can be more permissive than the old
    // implementation (legacy parser?), but likelier to match the users intent.
    // Suppress the recursion protection from tableDataBlock since we're trying
    // to parse a full table and if the link is itself nested in a table this
    // will always stop. Hopefully, this won't result in any overflows.
    r: full_table_in_link_caption_parameterized<linkdesc=false, table, tableDataBlock=false> {
        return $r;
    }

full_table_in_link_caption_parameterized
  = table_start_tag
    // Accept multiple end tags since a nested table may have been
    // opened in the table content line.
    (
        (sol+ (table_content_line / tplarg_or_template))* sol+ table_end_tag
    )+

// This rule assumes start-of-line position!
table_line
  = (! inline_breaks / & '{{!}}' )
    tl:(
        table_start_tag
      / table_content_line<table>
      / table_end_tag
    ) {
        return $tl;
    }

table_content_line = (space / comment)* (
      table_heading_tags
    / table_row_tag
    / table_data_tags
    / table_caption_tag
  )

table_start_tag "table_start_tag"
  = sc:(space / comment)*
    startPos:("" { return $this->endOffset(); })
    b:"{" p:pipe
    // ok to normalize away stray |} on rt (see T59360)
    ta:(table_attributes<table=false> / &{ $this->unreachable(); })
    tsEndPos:("" { return $this->endOffset(); })
    s2:space*
  {
      $coms = TokenizerUtils::popComments( $ta );
      if ( $coms ) {
          $tsEndPos = $coms['commentStartPos'];
      }
      $dp = new DataParsoid;
      $dp->tsr = new SourceRange( $startPos, $tsEndPos );
      if ( $p !== '|' ) {
          // Variation from default
          $dp->startTagSrc = $b . $p;
      }
      return array_merge( $sc,
          [ new TagTk( 'table', $ta, $dp ) ],
          $coms ? $coms['buf'] : [],
          $s2 );
  }

// FIXME: Not sure if we want to support it, but this should allow columns.
table_caption_tag
  = // avoid recursion via nested_block_in_table
    ! <tableDataBlock>
    p:pipe "+"
    args:row_syntax_table_args?
    tagEndPos:("" { return $this->endOffset(); })
    c:nested_block_in_table* {
        $tsr = new SourceRange( $this->startOffset(), $tagEndPos );
        return TokenizerUtils::buildTableTokens(
            'caption', '|+', $args, $tsr, $this->endOffset(), $c, true );
    }

table_row_tag
  = // avoid recursion via nested_block_in_table
    ! <tableDataBlock>
    p:pipe dashes:$"-"+
    a:(table_attributes<table=false> / &{ $this->unreachable(); })
    tagEndPos:("" { return $this->endOffset(); })
    s2:space*
  {
      $coms = TokenizerUtils::popComments( $a );
      if ( $coms ) {
          $tagEndPos = $coms['commentStartPos'];
      }
      $da = new DataParsoid;
      $da->tsr = new SourceRange( $this->startOffset(), $tagEndPos );
      $da->startTagSrc = $p . $dashes;
      // We rely on our tree builder to close the row as needed. This is
      // needed to support building tables from fragment templates with
      // individual cells or rows.
      $trToken = new TagTk( 'tr', $a, $da );
      return array_merge( [ $trToken ], $coms ? $coms['buf'] : [], $s2 );
  }

tds
  = ( pp:( pipe_pipe / p:pipe & row_syntax_table_args { return $p; } )
      tdt:table_data_tag {
          // Avoid modifying cached dataAttribs object
          $tdt[0] = clone $tdt[0];
          $da = $tdt[0]->dataAttribs = clone $tdt[0]->dataAttribs;
          $da->tsr = clone $da->tsr;
          $da->stx = 'row';
          $da->tsr->start -= strlen( $pp ); // include "||"
          if ( $pp !== '||' || ( isset( $da->startTagSrc ) && $da->startTagSrc !== $pp ) ) {
              // Variation from default
              $da->startTagSrc = $pp . ( isset( $da->startTagSrc ) ?
                  substr( $da->startTagSrc, 1 ) : '' );
          }
          return $tdt;
      }
  )*

table_data_tags
  = // avoid recursion via nested_block_in_table
    ! <tableDataBlock>
    p:pipe
    ![+-]
    td:table_data_tag
    tagEndPos:("" { return $this->endOffset(); })
    tds:tds {
        // Avoid modifying a cached result
        $td[0] = clone $td[0];
        $da = $td[0]->dataAttribs = clone $td[0]->dataAttribs;
        $da->tsr = clone $da->tsr;
        $da->tsr->start -= strlen( $p ); // include "|"
        if ( $p !== '|' ) {
            // Variation from default
            $da->startTagSrc = $p;
        }
        return array_merge( $td, $tds );
    }

table_data_tag
  = ! "}"
    arg:row_syntax_table_args?
    // use inline_breaks to break on tr etc
    tagEndPos:("" { return $this->endOffset(); })
    td:nested_block_in_table* {
        $tsr = new SourceRange( $this->startOffset(), $tagEndPos );
        return TokenizerUtils::buildTableTokens(
            'td', '|', $arg, $tsr, $this->endOffset(), $td );
    }

table_heading_tags = table_heading_tags_parameterized<&th>

table_heading_tags_parameterized
  = "!"
    thTag:table_heading_tag
    thTags:( pp:("!!" / pipe_pipe) tht:table_heading_tag {
        // Avoid modifying a cached result
        $tht[0] = clone $tht[0];
        $da = $tht[0]->dataAttribs = clone $tht[0]->dataAttribs;
        $da->tsr = clone $da->tsr;
        $da->stx = 'row';
        $da->tsr->start -= strlen( $pp ); // include "!!" or "||"
        if ( $pp !== '!!' || ( isset( $da->startTagSrc ) && $da->startTagSrc !== $pp ) ) {
            // Variation from default
            $da->startTagSrc = $pp . ( isset( $da->startTagSrc ) ?
                substr( $da->startTagSrc, 1 ) : '' );
        }
        return $tht;
    }
    )* {
        $thTag[0] = clone $thTag[0];
        $da = $thTag[0]->dataAttribs = clone $thTag[0]->dataAttribs;
        $da->tsr = clone $da->tsr;
        $da->tsr->start--; // include "!"
        array_unshift( $thTags, $thTag );
        return $thTags;
    }

table_heading_tag
  = arg:row_syntax_table_args?
    tagEndPos:("" { return $this->endOffset(); })
    c:( th:<&th> d:nested_block_in_table {
        if ( $th !== false && strpos( $this->text(), "\n" ) !== false ) {
            // There's been a newline. Remove the break and continue
            // tokenizing nested_block_in_tables.
            $th = false;
        }
        return $d;
    } )* {
        $tsr = new SourceRange( $this->startOffset(), $tagEndPos );
        return TokenizerUtils::buildTableTokens(
            'th', '!', $arg, $tsr, $this->endOffset(), $c );
    }

table_end_tag
  = sc:(space / comment)*
    startPos:("" { return $this->endOffset(); })
    p:pipe b:"}" {
        $dp = new DataParsoid;
        $dp->tsr = new SourceRange( $startPos, $this->endOffset() );
        $tblEnd = new EndTagTk( 'table', [], $dp );
        if ( $p !== '|' ) {
            // p+"<brace-char>" is triggering some bug in pegJS
            // I cannot even use that expression in the comment!
            $tblEnd->dataAttribs->endTagSrc = $p . $b;
        }
        array_push( $sc, $tblEnd );
        return $sc;
    }

/**
 * Table parameters separated from the content by a single pipe. Does *not*
 * match if followed by double pipe (row-based syntax).
 */
row_syntax_table_args
  = as:table_attributes<tableCellArg> s:optional_spaces p:pipe !pipe {
      return [ $as, $s, $p ];
  }

/*******************************************************************
 * Text variants and other general rules
 *******************************************************************/

/* All chars that cannot start syntactic structures in the middle of a line
 * XXX: ] and other end delimiters should probably only be activated inside
 * structures to avoid unnecessarily leaving the text rule on plain
 * content.
 *
 * TODO: Much of this is should really be context-dependent (syntactic
 * flags). The wikilink_preprocessor_text rule is an example where
 * text_char is not quite right and had to be augmented. Try to minimize /
 * clarify this carefully!
 *
 * This character class is inlined into $this->reUrltextLookahead. Changes
 * here may also need to be reflected there.
 */
text_char = [^-'<[{\n\r:;\]}|!=]

/* Legend
 * ' quotes (italic/bold)
 * < start of xmlish_tag
 * [ start of links
 * { start of parser functions, transclusion and template args
 * \n all sort of block-level markup at start of line
 * \r ditto
 * A-Za-z autolinks (http(s), nttp(s), mailto, ISBN, PMID, RFC)
 *
 * _ behavior switches (e.g., '__NOTOC__') (XXX: not URL related)
 * ! and | table cell delimiters, might be better to specialize those
 * = headings - also specialize those!
 *
 * The following chars are also included for now, but only apply in some
 * contexts and should probably be enabled only in those:
 * : separate definition in ; term : definition
 * ] end of link
 * } end of parser func/transclusion/template arg
 * - start of lang_variant -{ ... }-
 * ; separator in lang_variant
 */
urltext = (
      /* Very special performance hack:
       * Look for a plain text sequence, and if found, pretend to match the
       * empty string, but then advance currPos in the action and return the
       * whole plain text segment as a single result.
       */
      & {
          if ( preg_match( $this->reUrltextLookahead, $this->input, $m, 0, $this->currPos ) ) {
              $plain = $m[1];
              $this->urltextPlainSegment = $plain;
              $this->urltextFoundAutolink = ( $m[2] ?? '' ) !== '';
              return (bool)strlen( $plain );
          } else {
              $this->urltextFoundAutolink = false;
              return false;
          }
      } '' {
          $this->currPos += strlen( $this->urltextPlainSegment );
          return $this->urltextPlainSegment;
      }
    / & { return $this->urltextFoundAutolink; } al:autolink { return $al; }
    / & "&" he:htmlentity { return $he; }
    / & ('__') bs:behavior_switch { return $bs; }
    / text_char )+

raw_htmlentity = m:$("&" [#0-9a-zA-Zרלמرلم]+ ";") {
    return Utils::decodeWtEntities( $m );
}

htmlentity = cc:raw_htmlentity {
    // if this is an invalid entity, don't tag it with 'mw:Entity'
    // note that some entities (like ∾̳) decode to 2 codepoints!
    if ( mb_strlen( $cc ) > 2 /* decoded entity would be 1-2 codepoints */ ) {
        return $cc;
    }
    $dpStart = new DataParsoid;
    $dpStart->src = $this->text();
    $dpStart->srcContent = $cc;
    $dpStart->tsr = $this->tsrOffsets( 'start' );
    $dpEnd = new DataParsoid;
    $dpEnd->tsr = $this->tsrOffsets( 'end' );
    return [
        // If this changes, the nowiki extension's toDOM will need to follow suit
        new TagTk( 'span', [ new KV( 'typeof', 'mw:Entity' ) ], $dpStart ),
        $cc,
        new EndTagTk( 'span', [], $dpEnd )
    ];
}

spaces = $[ \t]+

optional_spaces = $[ \t]*

space = [ \t]

optionalSpaceToken = s:optional_spaces {
    if ( $s !== '' ) {
        return [ $s ];
    } else {
        return [];
    }
}

/* This rule corresponds to \s in the PHP preg_* functions,
 * which is used frequently in the old parser. The inclusion of
 * form feed (but not other whitespace, like vertical tab) is a quirk
 * of Perl, which PHP inherited via the PCRE (Perl-Compatible Regular
 * Expressions) library.
 */
space_or_newline = [ \t\n\r\x0c]

/* This rule corresponds to \b in the PHP preg_* functions,
 * after a word character. That is, it's a zero-width lookahead that
 * the next character is not a word character.
 */
end_of_word = eof / ![A-Za-z0-9_]

// Unicode "separator, space" category. It covers the \u0020 space as well
// as \u3000 IDEOGRAPHIC SPACE (see bug 19052). In PHP this is \p{Zs}.
// Keep this up-to-date with the characters tagged ;Zs; in
// http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
unispace = [ \u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]

// Non-newline whitespace, including non-breaking spaces. Used for magic links.
space_or_nbsp
  = space // includes \t
  / unispace
  / & "&" he:htmlentity &{ return is_array( $he ) && $he[ 1 ] === "\u{A0}"; } {
      return $he;
  }

// Used within ISBN magic links
space_or_nbsp_or_dash = space_or_nbsp / "-"

// Elements that do not break beginning or end of line for blocks (headers for instance)
comment_include_annotation = comment / include_limits<sol_il> / annotation_tag

sol = (empty_line_with_comments / sol_prefix) comment_include_annotation*

sol_prefix
  = newlineToken
  / & {
      // Use the sol flag only at the start of the input
      // Flag should always be an actual boolean (not falsy or undefined)
      $this->assert( is_bool( $this->options['sol'] ), 'sol should be boolean' );
      return $this->endOffset() === 0 && $this->options['sol'];
  } { return []; }

empty_line_with_comments
  = sp:sol_prefix
    p:("" { return $this->endOffset(); })
    c:(space* comment (space / comment)* newline)+ {
        $dp = new DataParsoid;
        $dp->tsr = new SourceRange( $p, $this->endOffset() );
        $dp->tokens = TokenizerUtils::flattenIfArray( $c );
        return [
            $sp,
            new SelfclosingTagTk( 'meta', [ new KV( 'typeof', 'mw:EmptyLine' ) ], $dp )
        ];
    }

comment_space = comment / space

nl_comment_space = newlineToken / comment_space

/**
 * noinclude / includeonly / onlyinclude rules. These are normally
 * handled by the xmlish_tag rule, except where generic tags are not
 * allowed- for example in directives, which are allowed in various attribute
 * names and -values.
 *
 * Example test case:
 * {|
 * |-<includeonly>
 * foo
 * </includeonly>
 * |Hello
 * |}
 */
include_limits
  = & ("<" "/"? n:("includeonly"i / "noinclude"i / "onlyinclude"i ) )
    il:xmlish_tag
    sol_il: <sol_il>
    & {
        $il = $il[0];
        $lname = mb_strtolower( $il->getName() );
        if ( !WTUtils::isIncludeTag( $lname ) ) {
            return false;
        }
        // Preserve SOL where necessary (for onlyinclude and noinclude)
        // Note that this only works because we encounter <*include*> tags in
        // the toplevel content and we rely on the php preprocessor to expand
        // templates, so we shouldn't ever be tokenizing inInclude.
        // Last line should be empty (except for comments)
        if ( $lname !== 'includeonly' && $sol_il && $il instanceof TagTk ) {
            $dp = $il->dataAttribs;
            $inclContent = $dp->extTagOffsets->stripTags( $dp->src );
            $nlpos = strrpos( $inclContent, "\n" );
            $last = $nlpos === false ? $inclContent : substr( $inclContent, $nlpos + 1 );
            if ( !preg_match( '/^(<!--([^-]|-(?!->))*-->)*$/D', $last ) ) {
                return false;
            }
        }
        return true;
    } {
        return $il;
    }

// Start of file
sof = & { return $this->endOffset() === 0 && !$this->pipelineOffset; }

// End of file
eof = & { return $this->endOffset() === $this->inputLength; }

newline = '\n' / '\r\n'

newlineToken = newline { return [ new NlTk( $this->tsrOffsets() ) ]; }

eolf = newline / eof

comment_space_eolf = (space+ / comment)* eolf

// 'Preprocessor' directive- higher-level things that can occur in otherwise
// plain-text content.
directive
  = comment
  / extension_annotation_tag
  / tplarg_or_template
  / & "-{" v:lang_variant_or_tpl { return $v; }
  / & "&" e:htmlentity { return $e; }
  / include_limits

wikilink_preprocessor_text
  = r:( t:$[^<[{\n\r\t|!\]}{ &\-]+
        // XXX gwicke: any more chars we need to allow here?
      / !inline_breaks wr:( directive / $( !"]]" ( text_char / [!<\-\}\]\n\r] ) ) ) {
          return $wr;
      }
  )+ {
      return TokenizerUtils::flattenStringlist( $r );
  }

// added special separator character class inline: separates url from
// description / text
extlink_nonipv6url
    // Prevent breaking on pipes when we're in a link description.
    // See the test, 'Images with the "|" character in the comment'.
  = extlink_nonipv6url_parameterized<linkdesc=false>

extlink_nonipv6url_parameterized
  = r:(
      $[^<[{\n\r|!\]}\-\t&="' \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]+
    / !inline_breaks s:( directive / [&|{\-!}=] ) { return $s; }
    / $(['] ![']) // single quotes are ok, double quotes are bad
  )+ {
      return TokenizerUtils::flattenString( $r );
  }

// Attribute values with preprocessor support
// n.b. / is a permissible char in the three rules below.
// We only break on />, enforced by the negated expression.
// Hence, it isn't included in the stop set.
// The stop set is space_or_newline and > which matches generic_att_value.
attribute_preprocessor_text
  = r:( $[^{}&<\-|/ \t\n\r\x0c>]+
      / !inline_breaks !'/>' s:( directive / less_than / [{}&\-|/] ) { return $s; }
  )+ {
      return TokenizerUtils::flattenString( $r );
  }

// The stop set is '> which matches generic_att_value.
attribute_preprocessor_text_single
  = r:( $[^{}&<\-|/'>]+
      / !inline_breaks !'/>' s:( directive / less_than / [{}&\-|/] ) { return $s; }
  )* {
      return TokenizerUtils::flattenString( $r );
  }

// The stop set is "> which matches generic_att_value.
attribute_preprocessor_text_double
  = r:( $[^{}&<\-|/">]+
      / !inline_breaks !'/>' s:( directive / less_than / [{}&\-|/] ) { return $s; }
  )* {
      return TokenizerUtils::flattenString( $r );
  }

// Variants with the entire attribute on a single line
// n.b. ! is a permissible char in the three rules below.
// We only break on !! in th, enforced by the inline break.
// Hence, it isn't included in the stop set.
// [ is also permissible but we give a chance to break
// for the [[ special case in the old parser's doTableStuff (See T2553).
// The stop set is space_or_newline and | which matches table_att_value.
table_attribute_preprocessor_text
  = r:( $[^{}&<\-!\[ \t\n\r\x0c|]+
      / !inline_breaks s:( directive / [{}&<\-!\[] ) { return $s; }
  )+ {
      return TokenizerUtils::flattenString( $r );
  }

// The stop set is '\r\n| which matches table_att_value.
table_attribute_preprocessor_text_single
  = r:( $[^{}&<\-!\['\r\n|]+
      / !inline_breaks s:( directive / [{}&<\-!\[] ) { return $s; }
  )* {
      return TokenizerUtils::flattenString( $r );
  }

// The stop set is "\r\n| which matches table_att_value.
table_attribute_preprocessor_text_double
  = r:( $[^{}&<\-!\["\r\n|]+
      / !inline_breaks s:( directive / [{}&<\-!\[] ) { return $s; }
  )* {
      return TokenizerUtils::flattenString( $r );
  }

// Special-case support for those pipe templates
pipe = "|" / "{{!}}"

// SSS FIXME: what about |{{!}} and {{!}}|
pipe_pipe = "||" / "{{!}}{{!}}"
| ver. 1.4 |
Github
|
.
| PHP 7.4.33 | Генерация страницы: 0.29 |
proxy
|
phpinfo
|
Настройки