Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed issue #68: only try to link titles that are not redirect in LinkTitles-cli.php #69

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Listen for XDebug",
"type": "php",
"request": "launch",
"port": 9003,
"hostname": "0.0.0.0",
// Path to your source in container
"pathMappings": {
"/var/www/html/extensions/LinkTitles": "${workspaceFolder}",
"/var/www/html": "${workspaceFolder}/../.."
}
}
]
}
6 changes: 6 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"intelephense.environment.includePaths": [
"../../includes",
"../../maintenance/includes",
]
}
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,12 @@ page, but there may be a page title "Mg" which redirects to the page
"Magnesium". This settings prevents erroneous linking to very short titles by
setting a minimum length. You can adjust this setting to your liking.


$wgLinkTitlesMaximumTitleLength = 25;
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Love this, excellent idea.


Only link to page titles that do not exceed a certain maximum length. Filtering out
long titles, which have a very low chance of appearing verbatim in a page, improves performance.

### Excluding pages from being linked to

$wgLinkTitlesBlackList = [];
Expand Down
1 change: 1 addition & 0 deletions extension.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
"LinkTitlesCheckRedirect": true,
"LinkTitlesEnableNoTargetMagicWord": false,
"LinkTitlesMinimumTitleLength": 4,
"LinkTitlesMaximumTitleLength": 25,
"LinkTitlesBlackList": [],
"LinkTitlesFirstOnly": true,
"LinkTitlesSmartMode": true,
Expand Down
8 changes: 8 additions & 0 deletions includes/Config.php
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,12 @@ class Config {
*/
public $minimumTitleLength;

/**
* Maximum length of a page title for it to qualify as a potential link target.
* @var int $maximumTitleLength
*/
public $maximumTitleLength;

/**
* Array of page titles that must never be link targets.
*
Expand Down Expand Up @@ -199,6 +205,7 @@ public function __construct() {
global $wgLinkTitlesParseOnRender;
global $wgLinkTitlesPreferShortTitles;
global $wgLinkTitlesMinimumTitleLength;
global $wgLinkTitlesMaximumTitleLength;
global $wgLinkTitlesBlackList;
global $wgLinkTitlesSourceNamespaces;
global $wgLinkTitlesTargetNamespaces;
Expand All @@ -218,6 +225,7 @@ public function __construct() {
$this->parseOnRender = $wgLinkTitlesParseOnRender;
$this->preferShortTitles = $wgLinkTitlesPreferShortTitles;
$this->minimumTitleLength = $wgLinkTitlesMinimumTitleLength;
$this->maximumTitleLength = $wgLinkTitlesMaximumTitleLength;
$this->blackList = $wgLinkTitlesBlackList;
$this->sourceNamespaces = $wgLinkTitlesSourceNamespaces ? $wgLinkTitlesSourceNamespaces : [ NS_MAIN ];
$this->targetNamespaces = $wgLinkTitlesTargetNamespaces;
Expand Down
8 changes: 5 additions & 3 deletions includes/Extension.php
Original file line number Diff line number Diff line change
Expand Up @@ -104,15 +104,17 @@ public static function onInternalParseBeforeLinks( \Parser &$parser, &$text ) {
*
* @param \Title $title Title object.
* @param \RequestContext $context Current request context. If in doubt, call MediaWiki's `RequestContext::getMain()` to obtain such an object.
* @param bool $dryRun When true, no page is changed, but the log of target pages is still produced.
* @param string $targetPageTitle When not empty, only page titles matching it are used as link targets in the source page.
* @return bool True if the page exists, false if the page does not exist
*/
public static function processPage( \Title $title, \RequestContext $context ) {
public static function processPage( \Title $title, \RequestContext $context, $dryRun = false, $targetPageTitle = "" ) {
$config = new Config();
$source = Source::createFromTitle( $title, $config );
if ( $source->hasContent() ) {
$linker = new Linker( $config );
$result = $linker->linkContent( $source );
if ( $result ) {
$result = $linker->linkContent( $source, $targetPageTitle );
if ( $result && !$dryRun ) {
$content = $source->getContent()->getContentHandler()->unserializeContent( $result );

$updater = $source->getPage()->newPageUpdater( $context->getUser());
Expand Down
13 changes: 11 additions & 2 deletions includes/Linker.php
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,10 @@ public function __construct( Config &$config ) {
*
* @param Source $source Source page whose text is to be linked.
* @param string $targetPageTitle When not empty, only page titles matching it are used as link targets in the source page.
* @return String|null Source page text with links to target pages, or null if no links were added
*/
public function linkContent( Source $source ) {
public function linkContent( Source $source, $targetPageTitle = "" ) {
if ( self::$locked > 0 || !$source->canBeLinked() ) {
return;
}
Expand All @@ -80,7 +81,7 @@ public function linkContent( Source $source ) {
$newLinks = false; // whether or not new links were added
$newText = $source->getText();
$splitter = Splitter::singleton( $this->config );
$targets = Targets::singleton( $source->getTitle(), $this->config );
$targets = Targets::singleton( $source->getTitle(), $this->config, $targetPageTitle );

// Iterate through the target page titles
foreach( $targets->queryResult as $row ) {
Expand All @@ -105,6 +106,12 @@ public function linkContent( Source $source ) {
// Credits to inhan @ StackOverflow for suggesting preg_split.
// See http://stackoverflow.com/questions/10672286
$arr = $splitter->split( $newText );
if ($arr === false)
{
echo "Error while trying to parse Title ". $source->getTitle() ."\n". preg_last_error() . " " . preg_last_error_msg() . "\n";
return;
}

$count = 0;

// Cache the target title text for the regex callbacks
Expand All @@ -124,6 +131,7 @@ public function linkContent( Source $source ) {
if ( $count > 0 ) {
$newLinks = true;
$newText = implode( '', $arr );
Targets::incrementTargetCount( $target->getPrefixedTitleText() );
}

// If smart mode is turned on, the extension will perform a second
Expand All @@ -148,6 +156,7 @@ public function linkContent( Source $source ) {
if ( $count > 0 ) {
$newLinks = true;
$newText = implode( '', $arr );
Targets::incrementTargetCount( $target->getPrefixedTitleText() );
}
} // $wgLinkTitlesSmartMode
}; // foreach $res as $row
Expand Down
15 changes: 12 additions & 3 deletions includes/Source.php
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,9 @@ private function __construct( Config $config) {
* @return bool True if the source page is eligible for linking, false otherwise.
*/
public function canBeLinked() {
return $this->hasDesiredNamespace() && !$this->hasNoAutolinksMagicWord();
return $this->hasDesiredNamespace() &&
!$this->hasNoAutolinksMagicWord() &&
!$this->isRedirect();
}

/**
Expand All @@ -146,6 +148,13 @@ public function hasDesiredNamespace() {
return in_array( $this->getTitle()->getNamespace(), $this->config->sourceNamespaces );
}

/**
 * Reports whether the title of this source page is a redirect.
 *
 * The answer is delegated to the isRedirect() method of the object
 * returned by getTitle().
 *
 * @return bool Presumably a boolean, as reported by the title object —
 *  confirm against MediaWiki's Title API.
 */
private function isRedirect() {
	$title = $this->getTitle();

	return $title->isRedirect();
}

/**
* Determines whether the source page contains the __NOAUTOLINKS__ magic word.
*
Expand All @@ -166,7 +175,7 @@ public function getTitle() {
if ( $this->page != null) {
$this->title = $this->page->getTitle();
} else {
throw new Exception( 'Unable to create Title for this Source because Page is null.' );
throw new \Exception( 'Unable to create Title for this Source because Page is null.' );
}
}
return $this->title;
Expand Down Expand Up @@ -240,7 +249,7 @@ public function getPage() {
if ( $this->title != null) {
$this->page = static::getPageObject( $this->title );
} else {
throw new Exception( 'Unable to create Page for this Source because Title is null.' );
throw new \Exception( 'Unable to create Page for this Source because Title is null.' );
}
}
return $this->page;
Expand Down
3 changes: 3 additions & 0 deletions includes/Splitter.php
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ protected function __construct( Config $config) {
* @return Array of strings where even indexes point to linkable sections.
*/
public function split( &$text ) {
	// Adjust the PCRE runtime settings before every split: switch the JIT
	// off and set the recursion limit to 500000.
	// NOTE(review): presumably needed so that very large pages can be split
	// without PCRE giving up — confirm the exact motivation.
	ini_set( 'pcre.jit', false );
	ini_set( 'pcre.recursion_limit', "500000" );

	// PREG_SPLIT_DELIM_CAPTURE keeps the captured delimiters in the result.
	// preg_split() returns false on failure, and that value is passed on
	// to the caller unchanged.
	$parts = preg_split( $this->splitter, $text, -1, PREG_SPLIT_DELIM_CAPTURE );

	return $parts;
}

Expand Down
33 changes: 23 additions & 10 deletions includes/Target.php
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ class Target {

private $nsText;

private static $pagesWithMagicWord;
private static $pagesRedirects;

/**
* Constructs a new Target object
*
Expand All @@ -77,7 +80,6 @@ class Target {
*/
public function __construct( $namespace, $title, Config &$config ) {
$this->title = \Title::makeTitleSafe( $namespace, $title );
$this->titleValue = $this->title->getTitleValue();
$this->config = $config;

// Use unicode character properties rather than \b escape sequences
Expand Down Expand Up @@ -159,7 +161,7 @@ public function getCaseInsensitiveRegex() {
* @return String regular expression pattern
*/
private function buildRegex( $searchTerm ) {
return '/(?<![\:\.\@\/\?\&])' . $this->wordStart . $searchTerm . $this->wordEnd . '/S';
return '/(?<![\:\.\@\/\?\&])' . $this->wordStart . $searchTerm . $this->wordEnd . '/Su';
}

/**
Expand Down Expand Up @@ -217,9 +219,14 @@ public function mayLinkTo( Source $source ) {
// page does indeed contain this magic word, return the page title
// as-is (unlinked).
if ( $this->config->enableNoTargetMagicWord ) {
if ( $this->getContent()->matchMagicWord( \MediaWiki\MediaWikiServices::getInstance()->getMagicWordFactory()->get( 'MAG_LINKTITLES_NOTARGET' ) ) ) {
return false;
if (!isset(self::$pagesWithMagicWord[$this->getPrefixedTitleText()]))
{
self::$pagesWithMagicWord[$this->getPrefixedTitleText()] = false;
if ( $this->getContent() )
self::$pagesWithMagicWord[$this->getPrefixedTitleText()] = $this->getContent()->matchMagicWord( \MediaWiki\MediaWikiServices::getInstance()->getMagicWordFactory()->get( 'MAG_LINKTITLES_NOTARGET' ) );
}

return !self::$pagesWithMagicWord[$this->getPrefixedTitleText()];
};
return true;
}
Expand All @@ -239,14 +246,20 @@ public function isSameTitle( Source $source) {
* @return bool True if the target redirects to the source.
*/
public function redirectsTo( $source ) {
if ( $this->getContent() ) {
if ( version_compare( MW_VERSION, '1.38', '>=' ) ) {
$redirectTitle = $this->getContent()->getRedirectTarget();
} else {
$redirectTitle = $this->getContent()->getUltimateRedirectTarget();
if (!isset(self::$pagesRedirects[$this->getPrefixedTitleText()]))
{
self::$pagesRedirects[$this->getPrefixedTitleText()] = null;

if ( $this->getContent() ) {
if ( version_compare( MW_VERSION, '1.38', '>=' ) ) {
self::$pagesRedirects[$this->getPrefixedTitleText()] = $this->getContent()->getRedirectTarget();
} else {
self::$pagesRedirects[$this->getPrefixedTitleText()] = $this->getContent()->getUltimateRedirectTarget();
}
}
return $redirectTitle && $redirectTitle->equals( $source->getTitle() );
}

return self::$pagesRedirects[$this->getPrefixedTitleText()] && self::$pagesRedirects[$this->getPrefixedTitleText()]->equals( $source->getTitle() );
}

/**
Expand Down
60 changes: 43 additions & 17 deletions includes/Targets.php
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@
*/
class Targets {
private static $instance;

/**
* Stores a list of pages that have been linked into other pages
*/
private static $includedPages;

/**
* Singleton factory that returns a (cached) database query results with
Expand All @@ -41,12 +46,27 @@ class Targets {
* @param String $sourceNamespace The namespace of the current page.
* @param Config $config LinkTitles configuration.
*/
public static function singleton( \Title $title, Config $config ) {
public static function singleton( \Title $title, Config $config, $targetPageTitle = "" ) {
if ( ( self::$instance === null ) || ( self::$instance->sourceNamespace != $title->getNamespace() ) ) {
self::$instance = new Targets( $title, $config );
self::$instance = new Targets( $title, $config, $targetPageTitle );
}
return self::$instance;
}

/**
 * Counts one more link insertion for the given target page.
 *
 * The per-title counters live in self::$includedPages; a title that has
 * no counter yet starts at zero before being incremented.
 *
 * @param string $pageTitle Array key identifying the target page
 *  (presumably the prefixed title text — verify against callers).
 */
public static function incrementTargetCount($pageTitle)
{
	$previousCount = self::$includedPages[$pageTitle] ?? 0;

	self::$includedPages[$pageTitle] = $previousCount + 1;
}

/**
 * Returns the per-title counters stored in self::$includedPages.
 *
 * If nothing has been stored yet, the static property is initialised to
 * an empty array first, so the caller always receives an array.
 *
 * @return array Map of page title => counter value.
 */
public static function getTargetedPages() : Array {
	if ( !self::$includedPages ) {
		self::$includedPages = [];
	}

	return self::$includedPages;
}

/**
* Invalidates the cache; the next call of Targets::singleton() will trigger
Expand Down Expand Up @@ -85,26 +105,39 @@ public static function invalidate() {
* The constructor is private to enforce using the singleton pattern.
* @param \Title $title
*/
private function __construct( \Title $title, Config $config) {
private function __construct( \Title $title, Config $config, $targetPageTitle = "" ) {
$this->config = $config;
$this->sourceNamespace = $title->getNamespace();
$this->fetch();
$this->fetch($targetPageTitle);
}

//
/**
* Fetches the page titles from the database.
*/
private function fetch() {
private function fetch($targetPageTitle = '') {
( $this->config->preferShortTitles ) ? $sortOrder = 'ASC' : $sortOrder = 'DESC';

$dbr = wfGetDB( DB_REPLICA );

$whereClauses = [
"page_content_model = 'wikitext'"
];

// Build a blacklist of pages that are not supposed to be link
// targets. This includes the current page.
if ( $this->config->blackList ) {
$blackList = 'page_title NOT IN ' .
$whereClauses[] = 'page_title NOT IN ' .
str_replace( ' ', '_', '("' . implode( '","', str_replace( '"', '\"', $this->config->blackList ) ) . '")' );
} else {
$blackList = null;
}

if ( !empty($targetPageTitle) ) {
$whereClauses[] = 'page_title LIKE ' . $dbr->addQuotes( $targetPageTitle );
}
else {
// Apply the minimum and maximum length limits for page titles:
$whereClauses[] = $this->charLength() . '(page_title) >= ' . $this->config->minimumTitleLength;
$whereClauses[] = $this->charLength() . '(page_title) <= ' . $this->config->maximumTitleLength;
}

if ( $this->config->sameNamespace ) {
Expand All @@ -129,23 +162,16 @@ private function fetch() {
$weightSelect = $weightSelect . " WHEN " . $namespaceValue . " THEN " . $currentWeight . PHP_EOL;
}
$weightSelect = $weightSelect . " END ";
$namespacesClause = '(' . implode( ', ', $namespaces ) . ')';
$whereClauses[] = 'page_namespace IN (' . implode( ', ', $namespaces ) . ')';

// Build an SQL query and fetch all page titles ordered by length from
// shortest to longest. Only titles from 'normal' pages (namespace uid
// = 0) are returned. Since the db may be sqlite, we need a try..catch
// structure because sqlite does not support the CHAR_LENGTH function.
$dbr = wfGetDB( DB_REPLICA );
$this->queryResult = $dbr->select(
'page',
array( 'page_title', 'page_namespace' , "weight" => $weightSelect),
array_filter(
array(
'page_namespace IN ' . $namespacesClause,
$this->charLength() . '(page_title) >= ' . $this->config->minimumTitleLength,
$blackList,
)
),
$whereClauses,
__METHOD__,
array( 'ORDER BY' => 'weight ASC, ' . $this->charLength() . '(page_title) ' . $sortOrder )
);
Expand Down
Loading