diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..a1eb82d --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,20 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Listen for XDebug", + "type": "php", + "request": "launch", + "port": 9003, + "hostname": "0.0.0.0", + // Path to your source in container + "pathMappings": { + "/var/www/html/extensions/LinkTitles": "${workspaceFolder}", + "/var/www/html": "${workspaceFolder}/../.." + } + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..dec34dd --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,6 @@ +{ + "intelephense.environment.includePaths": [ + "../../includes", + "../../maintenance/includes", + ] +} \ No newline at end of file diff --git a/README.md b/README.md index cf58a7e..ff03ffb 100644 --- a/README.md +++ b/README.md @@ -369,6 +369,12 @@ page, but there may be a page title "Mg" which redirects to the page "Magnesium". This settings prevents erroneous linking to very short titles by setting a minimum length. You can adjust this setting to your liking. + + $wgLinkTitlesMaximumTitleLength = 25; + +Only link to page titles that do not exceed this maximum length. Very long titles are unlikely +to appear verbatim in page text, so skipping them improves performance. 
+ ### Excluding pages from being linked to $wgLinkTitlesBlackList = []; diff --git a/extension.json b/extension.json index 33a25ab..feae620 100644 --- a/extension.json +++ b/extension.json @@ -28,6 +28,7 @@ "LinkTitlesCheckRedirect": true, "LinkTitlesEnableNoTargetMagicWord": false, "LinkTitlesMinimumTitleLength": 4, + "LinkTitlesMaximumTitleLength": 25, "LinkTitlesBlackList": [], "LinkTitlesFirstOnly": true, "LinkTitlesSmartMode": true, diff --git a/includes/Config.php b/includes/Config.php index e9bd5d9..3b6a5df 100644 --- a/includes/Config.php +++ b/includes/Config.php @@ -60,6 +60,12 @@ class Config { */ public $minimumTitleLength; + /** + * Maximum length of a page title for it to qualify as a potential link target. + * @var int $maximumTitleLength + */ + public $maximumTitleLength; + /** * Array of page titles that must never be link targets. * @@ -199,6 +205,7 @@ public function __construct() { global $wgLinkTitlesParseOnRender; global $wgLinkTitlesPreferShortTitles; global $wgLinkTitlesMinimumTitleLength; + global $wgLinkTitlesMaximumTitleLength; global $wgLinkTitlesBlackList; global $wgLinkTitlesSourceNamespaces; global $wgLinkTitlesTargetNamespaces; @@ -218,6 +225,7 @@ public function __construct() { $this->parseOnRender = $wgLinkTitlesParseOnRender; $this->preferShortTitles = $wgLinkTitlesPreferShortTitles; $this->minimumTitleLength = $wgLinkTitlesMinimumTitleLength; + $this->maximumTitleLength = $wgLinkTitlesMaximumTitleLength; $this->blackList = $wgLinkTitlesBlackList; $this->sourceNamespaces = $wgLinkTitlesSourceNamespaces ? $wgLinkTitlesSourceNamespaces : [ NS_MAIN ]; $this->targetNamespaces = $wgLinkTitlesTargetNamespaces; diff --git a/includes/Extension.php b/includes/Extension.php index 0edcd70..de99b54 100644 --- a/includes/Extension.php +++ b/includes/Extension.php @@ -104,15 +104,17 @@ public static function onInternalParseBeforeLinks( \Parser &$parser, &$text ) { * * @param \Title $title Title object. 
* @param \RequestContext $context Current request context. If in doubt, call MediaWiki's `RequestContext::getMain()` to obtain such an object. + * @param bool $dryRun When true, the page is not saved; the links are still computed, so the per-target usage counters are still updated. + * @param string $targetPageTitle When not empty, only this page title is used as a link target; it is passed through to Linker::linkContent(). * @return bool True if the page exists, false if the page does not exist */ - public static function processPage( \Title $title, \RequestContext $context ) { + public static function processPage( \Title $title, \RequestContext $context, $dryRun = false, $targetPageTitle = "" ) { $config = new Config(); $source = Source::createFromTitle( $title, $config ); if ( $source->hasContent() ) { $linker = new Linker( $config ); - $result = $linker->linkContent( $source ); - if ( $result ) { + $result = $linker->linkContent( $source, $targetPageTitle ); + if ( $result && !$dryRun ) { $content = $source->getContent()->getContentHandler()->unserializeContent( $result ); $updater = $source->getPage()->newPageUpdater( $context->getUser()); diff --git a/includes/Linker.php b/includes/Linker.php index 13428ff..f437611 100644 --- a/includes/Linker.php +++ b/includes/Linker.php @@ -68,9 +68,10 @@ public function __construct( Config &$config ) { * * @param \Title &$title Title object for the current page. 
* @param String $text String that holds the article content + * @param string $targetPageTitle When not empty, only this page title is considered as a link target (forwarded to Targets::singleton()) * @return String|null Source page text with links to target pages, or null if no links were added */ - public function linkContent( Source $source ) { + public function linkContent( Source $source, $targetPageTitle = "" ) { if ( self::$locked > 0 || !$source->canBeLinked() ) { return; } @@ -80,7 +81,7 @@ public function linkContent( Source $source ) { $newLinks = false; // whether or not new links were added $newText = $source->getText(); $splitter = Splitter::singleton( $this->config ); - $targets = Targets::singleton( $source->getTitle(), $this->config ); + $targets = Targets::singleton( $source->getTitle(), $this->config, $targetPageTitle ); // Iterate through the target page titles foreach( $targets->queryResult as $row ) { @@ -105,6 +106,12 @@ public function linkContent( Source $source ) { // Credits to inhan @ StackOverflow for suggesting preg_split. // See http://stackoverflow.com/questions/10672286 $arr = $splitter->split( $newText ); + if ($arr === false) + { + wfLogWarning( 'LinkTitles: preg_split failed for "' . $source->getTitle() . '": PCRE error ' . preg_last_error() . 
( function_exists( 'preg_last_error_msg' ) ? ' (' . preg_last_error_msg() . ')' : '' ) ); // log instead of echo so nothing is written into the active output; preg_last_error_msg() only exists on PHP 8+ + return; + } + $count = 0; // Cache the target title text for the regex callbacks @@ -124,6 +131,7 @@ public function linkContent( Source $source ) { if ( $count > 0 ) { $newLinks = true; $newText = implode( '', $arr ); + Targets::incrementTargetCount( $target->getPrefixedTitleText() ); } // If smart mode is turned on, the extension will perform a second @@ -148,6 +156,7 @@ public function linkContent( Source $source ) { if ( $count > 0 ) { $newLinks = true; $newText = implode( '', $arr ); + Targets::incrementTargetCount( $target->getPrefixedTitleText() ); } } // $wgLinkTitlesSmartMode }; // foreach $res as $row diff --git a/includes/Source.php b/includes/Source.php index c966987..8f0f072 100644 --- a/includes/Source.php +++ b/includes/Source.php @@ -134,7 +134,9 @@ private function __construct( Config $config) { * @return [type] [description] */ public function canBeLinked() { - return $this->hasDesiredNamespace() && !$this->hasNoAutolinksMagicWord(); + return $this->hasDesiredNamespace() && + !$this->hasNoAutolinksMagicWord() && + !$this->isRedirect(); } /** @@ -146,6 +148,13 @@ public function hasDesiredNamespace() { return in_array( $this->getTitle()->getNamespace(), $this->config->sourceNamespaces ); } + /** + * Stops pages that are redirects from being modified + */ + private function isRedirect() { + return $this->getTitle()->isRedirect(); + } + /** * Determines whether the source page contains the __NOAUTOLINKS__ magic word. * @@ -166,7 +175,7 @@ public function getTitle() { if ( $this->page != null) { $this->title = $this->page->getTitle(); } else { - throw new Exception( 'Unable to create Title for this Source because Page is null.' ); + throw new \Exception( 'Unable to create Title for this Source because Page is null.' 
); } } return $this->title; @@ -240,7 +249,7 @@ public function getPage() { if ( $this->title != null) { $this->page = static::getPageObject( $this->title ); } else { - throw new Exception( 'Unable to create Page for this Source because Title is null.' ); + throw new \Exception( 'Unable to create Page for this Source because Title is null.' ); } } return $this->page; diff --git a/includes/Splitter.php b/includes/Splitter.php index a8fcfa8..e82b09a 100644 --- a/includes/Splitter.php +++ b/includes/Splitter.php @@ -85,6 +85,9 @@ protected function __construct( Config $config) { * @return Array of strings where even indexes point to linkable sections. */ public function split( &$text ) { + ini_set( 'pcre.jit', false ); + ini_set( 'pcre.recursion_limit', "500000" ); + return preg_split( $this->splitter, $text, -1, PREG_SPLIT_DELIM_CAPTURE ); } diff --git a/includes/Target.php b/includes/Target.php index eb42076..3280a9a 100644 --- a/includes/Target.php +++ b/includes/Target.php @@ -67,6 +67,9 @@ class Target { private $nsText; + private static $pagesWithMagicWord; + private static $pagesRedirects; + /** * Constructs a new Target object * @@ -77,7 +80,6 @@ class Target { */ public function __construct( $namespace, $title, Config &$config ) { $this->title = \Title::makeTitleSafe( $namespace, $title ); - $this->titleValue = $this->title->getTitleValue(); $this->config = $config; // Use unicode character properties rather than \b escape sequences @@ -159,7 +161,7 @@ public function getCaseInsensitiveRegex() { * @return String regular expression pattern */ private function buildRegex( $searchTerm ) { - return '/(?wordStart . $searchTerm . $this->wordEnd . '/S'; + return '/(?wordStart . $searchTerm . $this->wordEnd . '/Su'; } /** @@ -217,9 +219,14 @@ public function mayLinkTo( Source $source ) { // page does indeed contain this magic word, return the page title // as-is (unlinked). 
if ( $this->config->enableNoTargetMagicWord ) { - if ( $this->getContent()->matchMagicWord( \MediaWiki\MediaWikiServices::getInstance()->getMagicWordFactory()->get( 'MAG_LINKTITLES_NOTARGET' ) ) ) { - return false; + if (!isset(self::$pagesWithMagicWord[$this->getPrefixedTitleText()])) + { + self::$pagesWithMagicWord[$this->getPrefixedTitleText()] = false; + if ( $this->getContent() ) + self::$pagesWithMagicWord[$this->getPrefixedTitleText()] = $this->getContent()->matchMagicWord( \MediaWiki\MediaWikiServices::getInstance()->getMagicWordFactory()->get( 'MAG_LINKTITLES_NOTARGET' ) ); } + + return !self::$pagesWithMagicWord[$this->getPrefixedTitleText()]; }; return true; } @@ -239,14 +246,20 @@ public function isSameTitle( Source $source) { * @return bool True if the target redirects to the source. */ public function redirectsTo( $source ) { - if ( $this->getContent() ) { - if ( version_compare( MW_VERSION, '1.38', '>=' ) ) { - $redirectTitle = $this->getContent()->getRedirectTarget(); - } else { - $redirectTitle = $this->getContent()->getUltimateRedirectTarget(); + if ( !array_key_exists( $this->getPrefixedTitleText(), self::$pagesRedirects ?? [] ) ) // isset() is false for a cached null, which made every non-redirect target a permanent cache miss + { + self::$pagesRedirects[$this->getPrefixedTitleText()] = null; // null means "no redirect target"; the key's presence marks the entry as cached + + if ( $this->getContent() ) { + if ( version_compare( MW_VERSION, '1.38', '>=' ) ) { + self::$pagesRedirects[$this->getPrefixedTitleText()] = $this->getContent()->getRedirectTarget(); + } else { + self::$pagesRedirects[$this->getPrefixedTitleText()] = $this->getContent()->getUltimateRedirectTarget(); + } } - return $redirectTitle && $redirectTitle->equals( $source->getTitle() ); } + + return self::$pagesRedirects[$this->getPrefixedTitleText()] && self::$pagesRedirects[$this->getPrefixedTitleText()]->equals( $source->getTitle() ); } /** diff --git a/includes/Targets.php b/includes/Targets.php index cf62205..698c932 100644 --- a/includes/Targets.php +++ b/includes/Targets.php @@ -29,6 +29,11 @@ */ class Targets { private static $instance; + + /** + * Stores a list of pages 
that have been linked into other pages + */ + private static $includedPages; /** * Singleton factory that returns a (cached) database query results with @@ -41,12 +46,27 @@ class Targets { * @param String $sourceNamespace The namespace of the current page. * @param Config $config LinkTitles configuration. */ - public static function singleton( \Title $title, Config $config ) { + public static function singleton( \Title $title, Config $config, $targetPageTitle = "" ) { if ( ( self::$instance === null ) || ( self::$instance->sourceNamespace != $title->getNamespace() ) ) { - self::$instance = new Targets( $title, $config ); + self::$instance = new Targets( $title, $config, $targetPageTitle ); } return self::$instance; } + + public static function incrementTargetCount($pageTitle) + { + if (!isset(self::$includedPages[$pageTitle])) + self::$includedPages[$pageTitle] = 0; + + self::$includedPages[$pageTitle]++; + } + + public static function getTargetedPages() : Array { + if (empty(self::$includedPages)) + self::$includedPages = []; + + return self::$includedPages; + } /** * Invalidates the cache; the next call of Targets::singleton() will trigger @@ -85,26 +105,39 @@ public static function invalidate() { * The constructor is private to enforce using the singleton pattern. * @param \Title $title */ - private function __construct( \Title $title, Config $config) { + private function __construct( \Title $title, Config $config, $targetPageTitle = "" ) { $this->config = $config; $this->sourceNamespace = $title->getNamespace(); - $this->fetch(); + $this->fetch($targetPageTitle); } // /** * Fetches the page titles from the database. */ - private function fetch() { + private function fetch($targetPageTitle = '') { ( $this->config->preferShortTitles ) ? $sortOrder = 'ASC' : $sortOrder = 'DESC'; + $dbr = wfGetDB( DB_REPLICA ); + + $whereClauses = [ + "page_content_model = 'wikitext'" + ]; + // Build a blacklist of pages that are not supposed to be link // targets. 
This includes the current page. if ( $this->config->blackList ) { - $blackList = 'page_title NOT IN ' . + $whereClauses[] = 'page_title NOT IN ' . str_replace( ' ', '_', '("' . implode( '","', str_replace( '"', '\"', $this->config->blackList ) ) . '")' ); - } else { - $blackList = null; + } + + if ( (string)$targetPageTitle !== '' ) { + $whereClauses[] = 'page_title = ' . $dbr->addQuotes( str_replace( ' ', '_', $targetPageTitle ) ); // exact match on the DB key: titles are stored with underscores, and LIKE would treat "_" and "%" as wildcards + } + else { + // Apply the minimum and maximum title length limits (only when no single target was requested): + $whereClauses[] = $this->charLength() . '(page_title) >= ' . (int)$this->config->minimumTitleLength; + $whereClauses[] = $this->charLength() . '(page_title) <= ' . (int)$this->config->maximumTitleLength; } if ( $this->config->sameNamespace ) { @@ -129,23 +162,16 @@ private function fetch() { $weightSelect = $weightSelect . " WHEN " . $namespaceValue . " THEN " . $currentWeight . PHP_EOL; } $weightSelect = $weightSelect . " END "; - $namespacesClause = '(' . implode( ', ', $namespaces ) . ')'; + $whereClauses[] = 'page_namespace IN (' . implode( ', ', $namespaces ) . ')'; // Build an SQL query and fetch all page titles ordered by length from // shortest to longest. Only titles from 'normal' pages (namespace uid // = 0) are returned. Since the db may be sqlite, we need a try..catch // structure because sqlite does not support the CHAR_LENGTH function. - $dbr = wfGetDB( DB_REPLICA ); $this->queryResult = $dbr->select( 'page', array( 'page_title', 'page_namespace' , "weight" => $weightSelect), - array_filter( - array( - 'page_namespace IN ' . $namespacesClause, - $this->charLength() . '(page_title) >= ' . $this->config->minimumTitleLength, - $blackList, - ) - ), + $whereClauses, __METHOD__, array( 'ORDER BY' => 'weight ASC, ' . $this->charLength() . '(page_title) ' . 
$sortOrder ) ); diff --git a/linktitles-cli.php b/linktitles-cli.php index e3d7b41..d03617b 100755 --- a/linktitles-cli.php +++ b/linktitles-cli.php @@ -1,6 +1,5 @@ #!/usr/bin/env php addOption( + "limit", + "Limit the number of pages to be replaced.", + false, // not required + true, // requires argument + "l" + ); + $this->addOption( + "min_length", + "Overides the \$wgLinkTitlesMinimumTitleLength variable", + false, // not required + true, // requires argument + false // no short alias: "l" is already taken by --limit; the name must match getOption( 'min_length' ) in allPages() + ); + $this->addOption( + "max_length", + "Overides the \$wgLinkTitlesMaximumTitleLength variable", + false, // not required + true, // requires argument + false // no short alias: "l" is already taken by --limit + ); $this->addOption( "page", "page name to process", @@ -80,6 +100,13 @@ public function __construct() { true, // requires argument "p" ); + $this->addOption( + "target", + "Specify a target page name to link to.", + false, // not required + true, // requires argument + "t" + ); $this->addOption( "verbose", "print detailed progress information", @@ -87,6 +114,13 @@ public function __construct() { false, // does not require an argument "v" ); + $this->addOption( + "dryrun", + "Show pages that are going to be linked into, with the number of occurences", + false, // not required + false, // does not require an argument + "d" + ); // TODO: Add back logging options. // TODO: Add configuration options. // $this->addOption( @@ -128,11 +162,7 @@ public function execute() { } } else { - $startIndex = intval( $this->getOption( 'start', 0 ) ); - if ( $startIndex < 0 ) { - $this->error( 'FATAL: Start index must be 0 or greater.', 1 ); - }; - $this->allPages( $startIndex ); + $this->allPages( ); } } @@ -141,27 +171,53 @@ public function execute() { * @return bool True on success, false on failure. 
*/ private function singlePage() { + $dryRun = $this->hasOption( 'dryrun' ); + $pageName = strval( $this->getOption( 'page' ) ); $this->output( "Processing single page: '$pageName'\n" ); $title = \Title::newFromText( $pageName ); - $success = Extension::processPage( $title, \RequestContext::getMain() ); + $success = Extension::processPage( $title, \RequestContext::getMain(), $dryRun ); if ( $success ) { $this->output( "Finished.\n" ); } else { $this->error( 'FATAL: There is no such page.', 3 ); } + + $this->outputTargetPagesLog(); + return $success; } /** * Process all pages in the Wiki. - * @param integer $index Index of the start page. - * @return bool True on success, false on failure. + * @return bool True on success, false on failure. */ - private function allPages( $index = 0 ) { + private function allPages( ) { $config = new Config(); $verbose = $this->hasOption( 'verbose' ); + $dryRun = $this->hasOption( 'dryrun' ); + $targetPageName = strval( $this->getOption( 'target' ) ); + + $startIndex = intval( $this->getOption( 'start', 0 ) ); + if ( $startIndex < 0 ) { + $this->error( 'FATAL: Start index must be 0 or greater.', 1 ); + }; + + $pageLimit = intval( $this->getOption( 'limit', 0 ) ); + if ( $pageLimit < 0 ) { + $this->error( 'FATAL: limit must be 0 or greater.', 1 ); + }; + if (empty($pageLimit)) + $pageLimit = 999999999; + + $minLength = intval( $this->getOption( 'min_length', 0 ) ); + if (!empty($minLength)) + $GLOBALS['wgLinkTitlesMinimumTitleLength'] = $minLength; + + $maxLength = intval( $this->getOption( 'max_length', 0 ) ); + if (!empty($maxLength)) + $GLOBALS['wgLinkTitlesMaximumTitleLength'] = $maxLength; // Retrieve page names from the database. $dbr = $this->getDB( DB_REPLICA ); @@ -171,22 +227,26 @@ private function allPages( $index = 0 ) { array( 'page_title', 'page_namespace' ), array( 'page_namespace IN ' . 
$namespacesClause, + 'page_is_redirect = 0', + "page_content_model = 'wikitext'", ), __METHOD__, array( - 'LIMIT' => 999999999, - 'OFFSET' => $index + 'ORDER BY' => 'page_namespace ASC, page_id DESC', + 'LIMIT' => $pageLimit, + 'OFFSET' => $startIndex ) ); + $numPages = $res->numRows(); $context = \RequestContext::getMain(); - $this->output( "Processing {$numPages} pages, starting at index {$index}...\n" ); + $this->output( "Processing {$numPages} pages, starting at index {$startIndex}...\n" ); $numProcessed = 0; foreach ( $res as $row ) { $title = \Title::makeTitleSafe( $row->page_namespace, $row->page_title ); $numProcessed += 1; - $index += 1; + $startIndex += 1; if ( $verbose ) { $this->output( sprintf( @@ -195,17 +255,26 @@ private function allPages( $index = 0 ) { $numProcessed, $numPages, $numProcessed / $numPages * 100, - $index, + $startIndex, $title ) ); } else { - $this->output( sprintf( "\rPage #%d (%02.0f%%) ", $index, $numProcessed / $numPages * 100 ) ); + $this->output( sprintf( "\rPage #%d (%02.0f%%) ", $startIndex, $numProcessed / $numPages * 100 ) ); } - Extension::processPage( $title, $context ); + Extension::processPage( $title, $context, $dryRun, $targetPageName ); } - $this->output( "\rFinished. \n" ); + $this->output( "\rFinished.\n" ); + + $this->outputTargetPagesLog(); + } + + private function outputTargetPagesLog() { + $this->output( "\n\nList of titles that have been used as targets:\n\n" ); + $linkedPages = Targets::getTargetedPages(); + foreach ($linkedPages as $page => $pageCount) + $this->output( "$page\t$pageCount\n" ); } }