Skip to content

Commit

Permalink
#9911 Consider new URLs with language, and create canonical URLs in C…
Browse files Browse the repository at this point in the history
…onvertApacheAccessLogFile CLI tool
  • Loading branch information
bozana committed Aug 15, 2024
1 parent 0fb17c2 commit d784b46
Show file tree
Hide file tree
Showing 2 changed files with 158 additions and 39 deletions.
86 changes: 75 additions & 11 deletions classes/cliTool/traits/ConvertLogFile.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,11 @@
use APP\statistics\StatisticsHelper;
use DateTime;
use Exception;
use PKP\config\Config;
use PKP\core\Core;
use PKP\core\Registry;
use PKP\db\DAORegistry;
use PKP\facades\Locale;
use PKP\file\FileManager;
use PKP\submission\Genre;

Expand Down Expand Up @@ -169,7 +171,6 @@ public function convert(string $fileName): void
}

$newEntry['userAgent'] = $entryData['userAgent'];
$newEntry['canonicalUrl'] = $entryData['url'];

[
'workingAssocType' => $assocType,
Expand All @@ -188,13 +189,23 @@ public function convert(string $fileName): void
$context = $this->contextsByPath[$foundContextPath];
$newEntry['contextId'] = $context->getId();

$this->setAssoc($assocType, $op, $args, $newEntry);
// temporarily set the canonicalUrlPage that is needed in the child class
$newEntry['canonicalUrlPage'] = $page;
$this->setAssoc($assocType, $args, $newEntry);
if (!array_key_exists('assocType', $newEntry)) {
if (!$this->isApacheAccessLogFile()) {
fwrite(STDERR, "The URL {$entryData['url']} in the line number {$lineNumber} was not considered." . PHP_EOL);
}
continue;
}

$canonicalUrl = $entryData['url']; // if this is not the apache log file i.e. it is the internal log file, the URLs are already canonical
if ($this->isApacheAccessLogFile()) {
$canonicalUrl = $this->getCanonicalUrl($foundContextPath, $newEntry['canonicalUrlPage'], $newEntry['canonicalUrlOp'], $newEntry['canonicalUrlArgs'] ?? null);
}
$newEntry['canonicalUrl'] = $canonicalUrl;
// unset elements that are temporarily used and should not be logged
unset($newEntry['canonicalUrlPage'], $newEntry['canonicalUrlOp'], $newEntry['canonicalUrlArgs']);
} else {
continue;
}
Expand Down Expand Up @@ -408,12 +419,14 @@ protected function getExpectedPageAndOp(): array
break;
case 'omp':
// Before 3.4 OMP did not have chapter assoc type i.e. chapter landing page
// so no need to consider it here
// consider it here however, in order to allow current apache access log file conversion
$pageAndOp = $pageAndOp + [
Application::ASSOC_TYPE_SUBMISSION_FILE => [
'catalog/download'],
Application::ASSOC_TYPE_MONOGRAPH => [
'catalog/book'],
Application::ASSOC_TYPE_CHAPTER => [
'catalog/book'],
Application::ASSOC_TYPE_SERIES => [
'catalog/series']
];
Expand Down Expand Up @@ -478,8 +491,8 @@ protected static function getContextPaths(string $urlInfo, bool $isPathInfo): ar
*/
protected static function getPage(string $urlInfo, bool $isPathInfo): string
{
$page = self::getUrlComponents($urlInfo, $isPathInfo, 0, 'page');
return Core::cleanFileVar(is_null($page) ? '' : $page);
$page = self::getUrlComponents($urlInfo, $isPathInfo, self::getOffset($urlInfo, $isPathInfo, 0), 'page');
return Core::cleanFileVar($page ?? '');
}

/**
Expand All @@ -489,8 +502,8 @@ protected static function getPage(string $urlInfo, bool $isPathInfo): string
*/
protected static function getOp(string $urlInfo, bool $isPathInfo): string
{
$operation = self::getUrlComponents($urlInfo, $isPathInfo, 1, 'op');
return Core::cleanFileVar(empty($operation) ? 'index' : $operation);
$operation = self::getUrlComponents($urlInfo, $isPathInfo, self::getOffset($urlInfo, $isPathInfo, 1), 'op');
return Core::cleanFileVar($operation ?: 'index');
}

/**
Expand All @@ -501,14 +514,32 @@ protected static function getOp(string $urlInfo, bool $isPathInfo): string
*/
protected static function getArgs(string $urlInfo, bool $isPathInfo): array
{
return self::getUrlComponents($urlInfo, $isPathInfo, 2, 'path');
return self::getUrlComponents($urlInfo, $isPathInfo, self::getOffset($urlInfo, $isPathInfo, 2), 'path');
}

/**
* Get offset. Add 1 extra if localization present in URL
*/
private static function getOffset(string $urlInfo, bool $isPathInfo, int $varOffset): int
{
return $varOffset + (int) !!self::getLocalization($urlInfo, $isPathInfo);
}

/**
* Get localization path present into the passed
* url information.
*/
public static function getLocalization(string $urlInfo, bool $isPathInfo): string
{
$locale = self::getUrlComponents($urlInfo, $isPathInfo, 0);
return Locale::isLocaleValid($locale) ? $locale : '';
}

/**
* Get url components (page, operation and args)
* based on the passed offset.
*/
protected static function getUrlComponents(string $urlInfo, bool $isPathInfo, int $offset, string $varName = ''): mixed
protected static function getUrlComponents(string $urlInfo, bool $isPathInfo, int $offset, string $varName = ''): array|string|null
{
$component = null;

Expand All @@ -517,7 +548,6 @@ protected static function getUrlComponents(string $urlInfo, bool $isPathInfo, in
$isArrayComponent = true;
}
if ($isPathInfo) {
$application = Application::get();
$contextDepth = 1; // Was $application->getContextDepth();

$vars = explode('/', trim($urlInfo, '/'));
Expand All @@ -544,10 +574,44 @@ protected static function getUrlComponents(string $urlInfo, bool $isPathInfo, in
return $component;
}

/**
* Construct the URL from context path, page, op, and params
*/
protected function getCanonicalUrl(string $contextPath, string $canonicalUrlPage, string $canonicalUrlOp, array $canonicalUrlArgs = null): string
{
$canonicalUrl = Application::get()->getDispatcher()->url(
Application::get()->getRequest(),
Application::ROUTE_PAGE,
$contextPath,
$canonicalUrlPage,
$canonicalUrlOp,
$canonicalUrlArgs,
urlLocaleForPage: ''
);

// Make sure we log the server name and not aliases.
$configBaseUrl = Config::getVar('general', 'base_url');
$requestBaseUrl = Application::get()->getRequest()->getBaseUrl();
if ($requestBaseUrl !== $configBaseUrl) {
// Make sure it's not an url override (no alias on that case).
if (!in_array($requestBaseUrl, Config::getContextBaseUrls()) &&
$requestBaseUrl !== Config::getVar('general', 'base_url[index]')) {
// Alias found, replace it by base_url from config file.
// Make sure we use the correct base url override value for the context, if any.
$baseUrlReplacement = Config::getVar('general', 'base_url[' . $contextPath . ']');
if (!$baseUrlReplacement) {
$baseUrlReplacement = $configBaseUrl;
}
$canonicalUrl = str_replace($requestBaseUrl, $baseUrlReplacement, $canonicalUrl);
}
}
return $canonicalUrl;
}

/**
* Set assoc type and IDs from the passed page, operation and arguments.
*/
protected function setAssoc(int $assocType, string $op, array $args, array &$newEntry): void
protected function setAssoc(int $assocType, array $args, array &$newEntry): void
{
$application = Application::get();
$applicationName = $application->getName();
Expand Down
Loading

0 comments on commit d784b46

Please sign in to comment.