From bb37a50590c13f5886bbf1c58b9cdddd25efc5ac Mon Sep 17 00:00:00 2001 From: ruhrotht Date: Mon, 29 Jan 2024 09:00:05 +0100 Subject: [PATCH] 312 Refactor to plain java - taken from intermediate repositors. --- .../htmlsanitycheck/Configuration.groovy | 266 -------------- .../MisconfigurationException.groovy | 16 - .../check/BrokenCrossReferencesChecker.groovy | 148 -------- .../check/BrokenHttpLinksChecker.groovy | 251 -------------- .../htmlsanitycheck/check/Checker.groovy | 92 ----- .../check/CheckerCreator.groovy | 93 ----- .../check/DuplicateIdChecker.groovy | 101 ------ .../check/ImageMapChecker.groovy | 250 -------------- .../check/MissingAltInImageTagsChecker.groovy | 75 ---- .../check/MissingImageFilesChecker.groovy | 146 -------- .../check/MissingLocalResourcesChecker.groovy | 167 --------- .../check/SuggestingChecker.groovy | 95 ----- .../check/UnknownCheckerException.groovy | 33 -- .../collect/SingleCheckResults.groovy | 176 ---------- .../htmlsanitycheck/html/HtmlElement.groovy | 107 ------ .../htmlsanitycheck/html/HtmlPage.groovy | 325 ------------------ .../htmlsanitycheck/html/ParserSample.groovy | 96 ------ .../aim42/htmlsanitycheck/html/URLUtil.groovy | 187 ---------- .../report/CreateLinkUtil.groovy | 20 -- .../htmlsanitycheck/report/Reporter.groovy | 141 -------- .../report/SummarizerUtil.groovy | 59 ---- .../main/groovy/org/aim42/inet/NetUtil.groovy | 42 --- .../AscendingSimilarityScoreComparator.java | 48 +-- .../DescendingSimilarityScoreComparator.java | 47 +-- .../net/ricecode/similarity/JaroStrategy.java | 96 +++--- .../similarity/JaroWinklerStrategy.java | 66 ++-- .../ricecode/similarity/SimilarityScore.java | 125 +++---- .../similarity/SimilarityStrategy.java | 20 +- .../similarity/StringSimilarityService.java | 38 +- .../StringSimilarityServiceImpl.java | 72 ++-- .../aim42/htmlsanitycheck/Configuration.java | 319 +++++++++++++++++ .../MisconfigurationException.java | 35 ++ .../htmlsanitycheck/ProductVersion.java} | 18 +- 
.../htmlsanitycheck/check/AllCheckers.java} | 31 +- .../check/BrokenCrossReferencesChecker.java | 132 +++++++ .../check/BrokenHttpLinksChecker.java | 246 +++++++++++++ .../aim42/htmlsanitycheck/check/Checker.java | 73 ++++ .../htmlsanitycheck/check/CheckerCreator.java | 62 ++++ .../check/DuplicateIdChecker.java | 84 +++++ .../check/ImageMapChecker.java | 186 ++++++++++ .../check/MissingAltInImageTagsChecker.java | 48 +++ .../check/MissingImageFilesChecker.java | 128 +++++++ .../check/MissingLocalResourcesChecker.java | 123 +++++++ .../check/SuggestingChecker.java | 79 +++++ .../check/UnknownCheckerException.java | 11 + .../collect/CheckResults.java} | 8 +- .../htmlsanitycheck/collect/Finding.java} | 68 ++-- .../htmlsanitycheck/collect/PageResults.java} | 15 +- .../collect/PerRunResults.java} | 75 ++-- .../htmlsanitycheck/collect/RunResults.java} | 14 +- .../collect/SingleCheckResults.java | 207 +++++++++++ .../collect/SinglePageResults.java} | 65 ++-- .../htmlsanitycheck/html/HtmlConst.java} | 6 +- .../htmlsanitycheck/html/HtmlElement.java | 92 +++++ .../aim42/htmlsanitycheck/html/HtmlPage.java | 274 +++++++++++++++ .../report/CreateLinkUtil.java | 17 + .../htmlsanitycheck/report/Reporter.java | 119 +++++++ .../report/SummarizerUtil.java | 38 ++ .../htmlsanitycheck/suggest/Suggester.java} | 26 +- .../tools/TrustAllCertificates.java | 60 ++++ .../org/aim42/htmlsanitycheck/tools/Web.java | 290 ++++++++++++++++ .../org/aim42/net/TrustAllCertificates.java | 43 ++- .../check/BrokenHttpLinksCheckerSpec.groovy | 4 +- .../htmlsanitycheck/html/HtmlPageTest.groovy | 3 +- .../htmlsanitycheck/html/URLUtilSpec.groovy | 9 +- .../htmlsanitycheck/html/URLUtilTest.groovy | 23 +- .../report/SummarizerUtilSpec.groovy | 1 + .../groovy/org/aim42/inet/NetUtilSpec.groovy | 7 +- src/docs/arc42/chapters/_config.adoc | 2 +- .../chapters/chap-05-BuildingBlocks.adoc | 6 +- src/docs/development/issue-252.adoc | 2 +- 71 files changed, 3106 insertions(+), 3341 deletions(-) delete mode 100644 
htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/Configuration.groovy delete mode 100644 htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/MisconfigurationException.groovy delete mode 100644 htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/BrokenCrossReferencesChecker.groovy delete mode 100644 htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/BrokenHttpLinksChecker.groovy delete mode 100644 htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/Checker.groovy delete mode 100644 htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/CheckerCreator.groovy delete mode 100644 htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/DuplicateIdChecker.groovy delete mode 100644 htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/ImageMapChecker.groovy delete mode 100644 htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/MissingAltInImageTagsChecker.groovy delete mode 100644 htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/MissingImageFilesChecker.groovy delete mode 100644 htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/MissingLocalResourcesChecker.groovy delete mode 100644 htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/SuggestingChecker.groovy delete mode 100644 htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/UnknownCheckerException.groovy delete mode 100644 htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/collect/SingleCheckResults.groovy delete mode 100644 htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/html/HtmlElement.groovy delete mode 100644 htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/html/HtmlPage.groovy delete mode 100644 htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/html/ParserSample.groovy delete mode 100644 
htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/html/URLUtil.groovy delete mode 100644 htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/report/CreateLinkUtil.groovy delete mode 100644 htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/report/Reporter.groovy delete mode 100644 htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/report/SummarizerUtil.groovy delete mode 100644 htmlSanityCheck-core/src/main/groovy/org/aim42/inet/NetUtil.groovy create mode 100644 htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/Configuration.java create mode 100644 htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/MisconfigurationException.java rename htmlSanityCheck-core/src/main/{groovy/org/aim42/htmlsanitycheck/ProductVersion.groovy => java/org/aim42/htmlsanitycheck/ProductVersion.java} (82%) rename htmlSanityCheck-core/src/main/{groovy/org/aim42/htmlsanitycheck/check/AllCheckers.groovy => java/org/aim42/htmlsanitycheck/check/AllCheckers.java} (53%) create mode 100644 htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/BrokenCrossReferencesChecker.java create mode 100644 htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/BrokenHttpLinksChecker.java create mode 100644 htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/Checker.java create mode 100644 htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/CheckerCreator.java create mode 100644 htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/DuplicateIdChecker.java create mode 100644 htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/ImageMapChecker.java create mode 100644 htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/MissingAltInImageTagsChecker.java create mode 100644 htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/MissingImageFilesChecker.java create mode 100644 
htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/MissingLocalResourcesChecker.java create mode 100644 htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/SuggestingChecker.java create mode 100644 htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/UnknownCheckerException.java rename htmlSanityCheck-core/src/main/{groovy/org/aim42/htmlsanitycheck/collect/CheckResults.groovy => java/org/aim42/htmlsanitycheck/collect/CheckResults.java} (89%) rename htmlSanityCheck-core/src/main/{groovy/org/aim42/htmlsanitycheck/collect/Finding.groovy => java/org/aim42/htmlsanitycheck/collect/Finding.java} (57%) rename htmlSanityCheck-core/src/main/{groovy/org/aim42/htmlsanitycheck/collect/PageResults.groovy => java/org/aim42/htmlsanitycheck/collect/PageResults.java} (81%) rename htmlSanityCheck-core/src/main/{groovy/org/aim42/htmlsanitycheck/collect/PerRunResults.groovy => java/org/aim42/htmlsanitycheck/collect/PerRunResults.java} (67%) rename htmlSanityCheck-core/src/main/{groovy/org/aim42/htmlsanitycheck/collect/RunResults.groovy => java/org/aim42/htmlsanitycheck/collect/RunResults.java} (81%) create mode 100644 htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/collect/SingleCheckResults.java rename htmlSanityCheck-core/src/main/{groovy/org/aim42/htmlsanitycheck/collect/SinglePageResults.groovy => java/org/aim42/htmlsanitycheck/collect/SinglePageResults.java} (61%) rename htmlSanityCheck-core/src/main/{groovy/org/aim42/htmlsanitycheck/html/HtmlConst.groovy => java/org/aim42/htmlsanitycheck/html/HtmlConst.java} (87%) create mode 100644 htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/html/HtmlElement.java create mode 100644 htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/html/HtmlPage.java create mode 100644 htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/report/CreateLinkUtil.java create mode 100644 htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/report/Reporter.java create 
mode 100644 htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/report/SummarizerUtil.java rename htmlSanityCheck-core/src/main/{groovy/org/aim42/htmlsanitycheck/suggest/Suggester.groovy => java/org/aim42/htmlsanitycheck/suggest/Suggester.java} (73%) create mode 100644 htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/tools/TrustAllCertificates.java create mode 100644 htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/tools/Web.java diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/Configuration.groovy b/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/Configuration.groovy deleted file mode 100644 index 98a5bbb0..00000000 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/Configuration.groovy +++ /dev/null @@ -1,266 +0,0 @@ -package org.aim42.htmlsanitycheck - - -import org.aim42.htmlsanitycheck.check.AllCheckers -import org.aim42.inet.NetUtil - -// see end-of-file for license information - -/** - * Handles (and can verify) configuration options. - * - * Implemented as REGISTRY pattern - * - * - * Explanation for configuring http status codes: - * The standard http status codes are defined in class @link NetUtil and can - * be overwritten by configuration: - * - * Example: You want 503 to be ok instead of error: - * httpSuccessCodes = [503] - * - * During configuration initialization, the value(s) of httpSuccessCodes will be: - * 1.) set-added to httpSuccessCodes, - * 2.) set-subtracted from the warnings and errors. - * - * - * This class needs to be updated if additional configuration options are added. - * - * - * Ideas for additional config options: - * ------------------------------------ - * - verbosity level on console during checks - * - * - */ - -class Configuration { - - /***************************************** - * configuration item names - * - * NEVER use any string constants for configuration - * item names within source code! 
- ****************************************/ - - // sourceDocuments is a collection of Strings, maybe only a single String - final static String ITEM_NAME_sourceDocuments = "sourceDocuments" - final static String ITEM_NAME_sourceDir = "sourceDir" - - final static String ITEM_NAME_checkingResultsDir = "checkingResultsDir" - final static String ITEM_NAME_junitResultsDir = "junitResultsDir" - - final static String ITEM_NAME_consoleReport = "consoleReport" - - // e.g. for Gradle based builds: fail the build if errors are found in Html file(s) - final static String ITEM_NAME_failOnErrors = "failOnErrors" - - // in case of slow internet connections, this timeout might be helpful - final static String ITEM_NAME_httpConnectionTimeout = "httpConnectionTimeout" - - // if (ignoreLocalhost == false) localhost-based URLs are marked as "Warning" - final static String ITEM_NAME_ignoreLocalhost = "ignoreLocalHost" - - // if (ignoreIPAddresses) then urls with numeric IP addresses are marked as "Warning" - final static String ITEM_NAME_ignoreIPAddresses = "ignoreIPAddresses" - - final static String ITEM_NAME_httpWarningCodes = "httpWarningCodes" - final static String ITEM_NAME_httpErrorCodes = "httpErrorCodes" - final static String ITEM_NAME_httpSuccessCodes = "httpSuccessCodes" - - final static String ITEM_NAME_urlsToExclude = "urlsToExclude" - final static String ITEM_NAME_hostsToExclude = "hostsToExclude" - - // extensions to be tried for noExtensionHrefs (see #252, MissingLocalResourcesChecker) - final static String ITEM_NAME_prefixOnlyHrefExtensions = "prefixOnlyHrefExtensions" - - final static String ITEM_NAME_checksToExecute = "checksToExecute" - - /*************************** - * private member - **************************/ - private Map configurationItems = [:] - - - // constructor to set (some) default values - Configuration() { - - this.configurationItems.put(ITEM_NAME_httpErrorCodes, NetUtil.HTTP_ERROR_CODES) - this.configurationItems.put(ITEM_NAME_httpSuccessCodes, 
NetUtil.HTTP_SUCCESS_CODES) - this.configurationItems.put(ITEM_NAME_httpWarningCodes, NetUtil.HTTP_WARNING_CODES) - - this.configurationItems.put(ITEM_NAME_httpConnectionTimeout, 5000) // 5 secs as default timeout - this.configurationItems.put(ITEM_NAME_ignoreIPAddresses, false) // warning if numerical IP addresses - this.configurationItems.put(ITEM_NAME_ignoreLocalhost, false) // warning if localhost-URLs - - this.configurationItems.put(ITEM_NAME_prefixOnlyHrefExtensions, NetUtil.POSSIBLE_EXTENSIONS) - - this.configurationItems.put(ITEM_NAME_checksToExecute, AllCheckers.checkerClazzes) - } - - /** retrieve a single configuration item - * - * @param itemName - * @return - */ - synchronized Object getConfigItemByName(final String itemName) { - return configurationItems.get(itemName) - } - - // special HtmlSanityChecker methods for mandatory configuration items - // ******************************************************************* - - /** - * convenience method for simplified testing - */ - synchronized void addSourceFileConfiguration(File srcDir, Collection srcDocs) { - addConfigurationItem(ITEM_NAME_sourceDir, srcDir) - addConfigurationItem(ITEM_NAME_sourceDocuments, srcDocs) - } - - /** - * @return true if item is already present, false otherwise - */ - boolean checkIfItemPresent(String itemName) { - boolean result = false - if (configurationItems.get(itemName) != null) { - result = true - } - return result - } - - /** - * @return the number of configuration items - */ - int nrOfConfigurationItems() { - return configurationItems.size() - } - - /** add a single configuration item, unless its value is null - * - * @param itemName - * @param itemValue - */ - void addConfigurationItem(String itemName, Object itemValue) { - if (itemValue != null) { - configurationItems.put(itemName, itemValue) - } - } - - /** - * overwrites httpSuccessCodes configuration - */ - void overwriteHttpSuccessCodes(Collection additionalSuccessCodes) { - def errCodes = 
getConfigItemByName(Configuration.ITEM_NAME_httpErrorCodes) - def warnCodes = getConfigItemByName(Configuration.ITEM_NAME_httpWarningCodes) - def successCodes = getConfigItemByName(Configuration.ITEM_NAME_httpSuccessCodes) - - additionalSuccessCodes.each { code -> - successCodes += code // add to success codes - errCodes -= code // the new success code cannot be error code any longer - warnCodes -= code // neither warning - } - - updateSuccessWarningErrorCodesConfiguration(errCodes, warnCodes, successCodes) - } - - /** - * overwrites httpWarningCodes configuration - */ - void overwriteHttpWarningCodes(Collection additionalWarningCodes) { - def errCodes = getConfigItemByName(Configuration.ITEM_NAME_httpErrorCodes) - def warnCodes = getConfigItemByName(Configuration.ITEM_NAME_httpWarningCodes) - def successCodes = getConfigItemByName(Configuration.ITEM_NAME_httpSuccessCodes) - - additionalWarningCodes.each { code -> - warnCodes += code // add to warning codes - successCodes -= code // remove from success codes - errCodes -= code // and remove from error codes - } - - updateSuccessWarningErrorCodesConfiguration(errCodes, warnCodes, successCodes) - } - - /** - * overwrites httpErrorCodes configuration - */ - void overwriteHttpErrorCodes(Collection additionalErrorCodes) { - def errCodes = getConfigItemByName(Configuration.ITEM_NAME_httpErrorCodes) - def warnCodes = getConfigItemByName(Configuration.ITEM_NAME_httpWarningCodes) - def successCodes = getConfigItemByName(Configuration.ITEM_NAME_httpSuccessCodes) - - additionalErrorCodes.each { code -> - errCodes += code // add to error codes - successCodes -= code - warnCodes -= code - } - - updateSuccessWarningErrorCodesConfiguration(errCodes, warnCodes, successCodes) - } - - - void updateSuccessWarningErrorCodesConfiguration(errCodes, warnCodes, successCodes) { - addConfigurationItem(Configuration.ITEM_NAME_httpErrorCodes, errCodes) - addConfigurationItem(Configuration.ITEM_NAME_httpWarningCodes, warnCodes) - 
addConfigurationItem(Configuration.ITEM_NAME_httpSuccessCodes, successCodes) - } - - /** - * overwrites prefixOnlyHrefExtensions - */ - void overwritePrefixOnlyHrefExtensions( Collection prefixesToBeConsidered ) { - addConfigurationItem( Configuration.ITEM_NAME_prefixOnlyHrefExtensions, prefixesToBeConsidered) - } - - - /** - * checks plausibility of configuration: - * We need at least one html file as input, maybe several - * @param configuration instance - * - * srcDocs needs to be of type {@link FileCollection} - * to be Gradle-compliant - */ - Boolean isValid() { - - // we need at least srcDir and srcDocs!! - File srcDir = getConfigItemByName(Configuration.ITEM_NAME_sourceDir) - Set srcDocs = getConfigItemByName(Configuration.ITEM_NAME_sourceDocuments) - - // cannot check if source director is null (= unspecified) - if ((srcDir == null)) { - throw new MisconfigurationException("source directory must not be null") - } - - if ((!srcDir.exists())) { - throw new MisconfigurationException("given sourceDir $srcDir does not exist.") - } - - // cannot check if both input params are null - if (srcDocs == null) { - throw new MisconfigurationException("source documents must not be null") - } - - // empty SrcDocs - if (srcDocs.empty) { - throw new MisconfigurationException("source documents must not be empty") - } - - Object checksToExecute = getConfigItemByName(Configuration.ITEM_NAME_checksToExecute) - if (!(checksToExecute instanceof Collection) || !checksToExecute) { - throw new MisconfigurationException("checks to execute have to be a non empty collection") - } - - // if no exception has been thrown until now, - // the configuration seems to be valid.. 
- return true - } - - - @Override - String toString() { - return "Configuration{" + - "configurationItems=" + configurationItems + - '}'; - } -} diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/MisconfigurationException.groovy b/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/MisconfigurationException.groovy deleted file mode 100644 index ebbdd231..00000000 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/MisconfigurationException.groovy +++ /dev/null @@ -1,16 +0,0 @@ -package org.aim42.htmlsanitycheck - -// see end-of-file for license information - - -class MisconfigurationException extends Exception { - - - public MisconfigurationException( String message, File srcDir) { - super( message + ": " + srcDir.canonicalPath ) - } - - public MisconfigurationException( String message ) { - super(message) - } -} diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/BrokenCrossReferencesChecker.groovy b/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/BrokenCrossReferencesChecker.groovy deleted file mode 100644 index 569dc717..00000000 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/BrokenCrossReferencesChecker.groovy +++ /dev/null @@ -1,148 +0,0 @@ -package org.aim42.htmlsanitycheck.check - -import org.aim42.htmlsanitycheck.Configuration -import org.aim42.htmlsanitycheck.collect.SingleCheckResults -import org.aim42.htmlsanitycheck.html.HtmlPage -import org.aim42.htmlsanitycheck.html.URLUtil - -// see end-of-file for license information - - -class BrokenCrossReferencesChecker extends SuggestingChecker { - - private List listOfIds // id="XYZ" - private List hrefList - private Set hrefSet - - BrokenCrossReferencesChecker(Configuration pConfig) { - super(pConfig) - } // - - @Override - protected void initCheckingResultsDescription() { - checkingResults.whatIsChecked = "Broken Internal Links Check" - checkingResults.sourceItemName = "href" - 
checkingResults.targetItemName = "missing id" - } - - @Override - /** - set valid possibilities, where suggester can choose from. - Here: List of (internal) id's, meaning link-targets. - */ - protected void setValidPossibilities() { - validPossibilities = listOfIds - } - - @Override - protected SingleCheckResults check(final HtmlPage pageToCheck) { - //get list of all a-tags " - //if (URLUtil.isValidURL(href)) - checkSingleInternalLink(href) - } - } - - /** - * check a single internal link (href) against the existing id's within - * the html document - */ - private void checkSingleInternalLink(String href) { - checkingResults.incNrOfChecks() - if (URLUtil.containsInvalidChars(href)) { - // we found link with illegal characters! - String findingText = "link \"$href\" contains illegal characters" - // now count occurrences - how often is it referenced - int nrOfReferences = countNrOfReferences(href) - if (nrOfReferences > 1) { - findingText += ", reference count: $nrOfReferences" - } - checkingResults.newFinding(findingText, nrOfReferences) - } else - // we check only cross-references, that means we exclude - // remote-urls and references to local files - if (URLUtil.isCrossReference(href)) { - - // bookkeeping: - checkingResults.incNrOfChecks() - - doesLinkTargetExist(href) - } - } - - /** - * check if the id for the href parameter exists - * - * @param href = "#XYZ" in id="XYZ" - * */ - private void doesLinkTargetExist(String href) { - if (href == '#') { - return - } - - // strip href of its leading "#" - String linkTarget = (href.startsWith("#")) ? href[1..-1] : href - // fragment can be URL-encoded - linkTarget = URLDecoder.decode(linkTarget, 'UTF-8') - - if (!listOfIds.contains(linkTarget)) { - // we found a broken link! 
- addBrokenLinkToResults(linkTarget, href) - } - } - - /** - * bookkeeping the broken links that we found - */ - private void addBrokenLinkToResults(String linkTarget, String href) { - String findingText = "link target \"$linkTarget\" missing" - - // now count occurrences - how often is it referenced - int nrOfReferences = countNrOfReferences(href) - if (nrOfReferences > 1) { - findingText += ", reference count: $nrOfReferences" - } - - // determine suggestions "what could have been meant?" - - checkingResults.newFinding(findingText, nrOfReferences) - } - - private int countNrOfReferences(String href) { - int nrOfReferences = hrefList.findAll { it == href }.size() - return nrOfReferences - } - -} -/*======================================================================== - Copyright Gernot Starke and aim42 contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an - "AS IS" BASIS,WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, - either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- ========================================================================*/ diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/BrokenHttpLinksChecker.groovy b/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/BrokenHttpLinksChecker.groovy deleted file mode 100644 index 1fee2c9a..00000000 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/BrokenHttpLinksChecker.groovy +++ /dev/null @@ -1,251 +0,0 @@ -package org.aim42.htmlsanitycheck.check - -import org.aim42.htmlsanitycheck.Configuration -import org.aim42.htmlsanitycheck.collect.Finding -import org.aim42.htmlsanitycheck.collect.SingleCheckResults -import org.aim42.htmlsanitycheck.html.HtmlElement -import org.aim42.htmlsanitycheck.html.HtmlPage -import org.aim42.inet.NetUtil -import org.aim42.net.TrustAllCertificates - - -/** - * Check html anchor href attributes - * see https://www.w3schools.com/tags/att_a_href.asp - * - */ -class BrokenHttpLinksChecker extends Checker { - - // all href attributes with http(s) protocol, - // including potential duplicates - // need that to calculate "nrOfOccurrences" - private List hrefList - - // the pure http/https-hrefs a set, duplicates are removed here - private Set hrefSet - - // get the (configured) statusCodes, just syntactic sugar... 
- private final Collection successCodes - private final Collection warningCodes - private final Collection errorCodes - - - - BrokenHttpLinksChecker(Configuration pConfig) { - super(pConfig) - - successCodes = myConfig.getConfigItemByName(Configuration.ITEM_NAME_httpSuccessCodes) - warningCodes = myConfig.getConfigItemByName(Configuration.ITEM_NAME_httpWarningCodes) - errorCodes = myConfig.getConfigItemByName(Configuration.ITEM_NAME_httpErrorCodes) - - } - - @Override - protected void initCheckingResultsDescription() { - checkingResults.whatIsChecked = "External Links Check" - checkingResults.sourceItemName = "anchor href attribute" - checkingResults.targetItemName = "broken external link" - } - - @Override - protected SingleCheckResults check(final HtmlPage pageToCheck) { - - //get set of all a-tags " - doubleCheckSingleHttpLink(href) - } - } - - /** - * Double-Check a single http(s) link: - * Some servers don't accept head request and send errors like 403 or 405, - * instead of 200. - * Therefore we double-check: in case of errors or warnings, - * we try again with a GET, to get the "finalResponseCode" - - * which we then categorize as success, error or warning - */ - - - protected void doubleCheckSingleHttpLink(String href) { - - // to create appropriate error messages - String problem - - // bookkeeping: - checkingResults.incNrOfChecks() - - try { - URL url = new URL(href) - - // check if localhost-URL - checkIfLocalhostURL(url, href) - - // check if (numerical) IP address - checkIfIPAddress(url, href) - - try { - HttpURLConnection firstConnection = getNewURLConnection(url) - - // try to connect - firstConnection.connect() - int responseCode = firstConnection.getResponseCode() - - // issue 218 and 219: some web servers respond with 403 or 405 - // when given HEAD requests. 
Therefore, try GET - if (responseCode in successCodes) return - - // issue 244: special case for redirects - // thanx to https://stackoverflow.com/questions/39718059/read-from-url-in-groovy-with-redirect - else if (responseCode in [301, 302, 303, 307, 308]) { - String newLocation - if (firstConnection.headerFields.'Location') { - newLocation = firstConnection.headerFields.Location.first() - - problem = """Warning: ${href} returned statuscode ${responseCode}, new location: $newLocation""" - checkingResults.addFinding(new Finding(problem)) - - } - } - // in case of errors or warnings, - // try again with GET. - - else { - HttpURLConnection secondConnection = getNewURLConnection(url) - secondConnection.setRequestMethod("GET") - int finalResponseCode = secondConnection.getResponseCode() - secondConnection.disconnect() - - switch (finalResponseCode) { - case successCodes: return - case warningCodes: problem = "Warning:"; break - case errorCodes: problem = "Error:"; break - default: problem = "Error: Unknown or unclassified response code:" - } - - problem += """ ${href} returned statuscode ${responseCode}.""" - - checkingResults.addFinding(new Finding(problem)) - - } // else - - // cleanup firstConnection - firstConnection.disconnect() - - } - catch (UnknownHostException) { - Finding unknownHostFinding = new Finding( """Unknown host with href=$href""") - checkingResults.addFinding( unknownHostFinding) - } - catch (InterruptedIOException | ConnectException | IOException exception) { - Finding someException = new Finding("""exception ${exception.toString()} with href=${href}""") - checkingResults.addFinding(someException) - } - } - catch (MalformedURLException exception) { - Finding malformedURLFinding = new Finding("""malformed URL exception with href=${href}""") - checkingResults.addFinding(malformedURLFinding) - } - } - - - private HttpURLConnection getNewURLConnection(URL url) { - - TrustAllCertificates.install() - - HttpURLConnection connection = (HttpURLConnection) 
url.openConnection(); - connection.setRequestMethod("HEAD"); - - // httpConnectionTimeout is a configuration parameter - // that defaults to 5000 (msec) - connection.setConnectTimeout( - myConfig?.getConfigItemByName(Configuration.ITEM_NAME_httpConnectionTimeout) - ) - - // to avoid nasty 403 errors (forbidden), we set a referrer and user-agent - // - connection.setRequestProperty("Referer", "https://aim42.org"); - connection.setRequestProperty("User-Agent", "Mozilla/5.0 (X11; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0"); - - // TODO followRedirects should be a configuration parameter - // that defaults to false - - return connection - } - - // if configured, ip addresses in URLs yield warnings - private void checkIfIPAddress(URL url, String href) { - if (!myConfig.getConfigItemByName(Configuration.ITEM_NAME_ignoreIPAddresses)) { - String host = url.getHost() - - if (host.matches("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}")) { - Finding localhostWarning = new Finding("""Warning: numerical urls (ip address) indicates suspicious environment dependency: href=${ - href - }""") - checkingResults.addFinding(localhostWarning) - } - } - } - - // if configured ,localhost-URLs yield warnings! - private void checkIfLocalhostURL(URL url, String href) { - if (!myConfig.getConfigItemByName(Configuration.ITEM_NAME_ignoreLocalhost)) { - String host = url.getHost() - if ((host == "localhost") || host.startsWith("127.0.0")) { - Finding localhostWarning = new Finding("""Warning: localhost urls indicates suspicious environment dependency: href=${ - href - }""") - checkingResults.addFinding(localhostWarning) - } - } - } - - -} - -/************************************************************************ - * This is free software - without ANY guarantee! - * - * - * Copyright Dr. Gernot Starke, arc42.org - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - *********************************************************************** */ - diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/Checker.groovy b/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/Checker.groovy deleted file mode 100644 index d0fe08fd..00000000 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/Checker.groovy +++ /dev/null @@ -1,92 +0,0 @@ -package org.aim42.htmlsanitycheck.check - -import org.aim42.htmlsanitycheck.Configuration -import org.aim42.htmlsanitycheck.collect.SingleCheckResults -import org.aim42.htmlsanitycheck.html.HtmlPage - - -// see end-of-file for license information - - -/** - * Base class for the different concrete checkers (i.e. ImageChecker), - * following the template-method-pattern. - * - * No constructor is defined, allowing for arbitrary "named parameters" - * in constructor calls. - * - * While checking, every subclass builds an instance of {@link SingleCheckResults} - * - * @author Gernot Starke - */ - -abstract class Checker { - - // temporarily keep results of a single check, ie missing-images, - // internal cross-references etc. - SingleCheckResults checkingResults - - - // keep your own configuration - Configuration myConfig - - public Checker ( Configuration pConfig ) { - this.myConfig = pConfig - } - - // tag::performCheckTemplateMethod[] - /** - ** template method for performing a single type of checks on the given @see HtmlPage. 
- * - * Prerequisite: pageToCheck has been successfully parsed, - * prior to constructing this Checker instance. - **/ - public SingleCheckResults performCheck( final HtmlPage pageToCheck) { - // assert non-null htmlPage - assert pageToCheck != null - - checkingResults = new SingleCheckResults() - - // description is set by subclasses - initCheckingResultsDescription() - - return check( pageToCheck ) // <1> delegate check() to subclass - } - // end::performCheckTemplateMethod[] - - - /** - * Initialize with suitable description. - * - */ - abstract protected void initCheckingResultsDescription() - - - /** - * Perform a particular kind of checks, i.e. missing-local-images-check - * - * Called by {@link #performCheck()} as part of the template method pattern. - * @return collected results of this Checker instance - */ - abstract protected SingleCheckResults check( final HtmlPage pageToCheck ) - - -} - - -/*======================================================================== - Copyright Gernot Starke and aim42 contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an - "AS IS" BASIS,WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, - either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- ========================================================================*/ diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/CheckerCreator.groovy b/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/CheckerCreator.groovy deleted file mode 100644 index e5d7fc93..00000000 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/CheckerCreator.groovy +++ /dev/null @@ -1,93 +0,0 @@ -package org.aim42.htmlsanitycheck.check - -import org.aim42.htmlsanitycheck.Configuration -import org.slf4j.Logger -import org.slf4j.LoggerFactory - -/** abstract factory to create Checker instances - * - */ - -class CheckerCreator { - - private final static Logger logger = LoggerFactory.getLogger(CheckerCreator.class) - - - public static ArrayList createCheckerClassesFrom( final Collection checkerClasses, - final Configuration pConfig) { - - ArrayList checkers = new LinkedHashSet(checkerClasses.size()) - - checkerClasses.each { checkerClass -> - checkers.add(CheckerCreator.createSingleChecker(checkerClass, pConfig )) - - } - - return checkers - - } - - - public static Checker createSingleChecker(final Class checkerClass, final Configuration pConfig ) { - Checker checker - - // switch over all possible Checker classes - // in case of new Checkers, this has to be adapted, - // as Checker constructors will differ in minor details! 
- - // clearly violates the open-close principle - - switch (checkerClass) { - case BrokenCrossReferencesChecker.class: - checker = new BrokenCrossReferencesChecker( pConfig); break - - case BrokenHttpLinksChecker.class: - checker = new BrokenHttpLinksChecker( pConfig ); break - - case DuplicateIdChecker.class: - checker = new DuplicateIdChecker( pConfig ); break - - case ImageMapChecker.class: - checker = new ImageMapChecker(pConfig); break - - case MissingAltInImageTagsChecker.class: - checker = new MissingAltInImageTagsChecker(pConfig); break - - case MissingImageFilesChecker.class: - checker = new MissingImageFilesChecker(pConfig); break - - case MissingLocalResourcesChecker.class: - checker = new MissingLocalResourcesChecker(pConfig); break - - - default: - logger.warn("unknown Checker ${checkerClass.toString()}") - throw new UnknownCheckerException(checkerClass.toString()) - - } - - return checker - - } -} - -/************************************************************************ - * This is free software - without ANY guarantee! - * - * - * Copyright Dr. Gernot Starke, arc42.org - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - *********************************************************************** */ - diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/DuplicateIdChecker.groovy b/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/DuplicateIdChecker.groovy deleted file mode 100644 index 3bcee382..00000000 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/DuplicateIdChecker.groovy +++ /dev/null @@ -1,101 +0,0 @@ -package org.aim42.htmlsanitycheck.check - -import org.aim42.htmlsanitycheck.Configuration -import org.aim42.htmlsanitycheck.collect.SingleCheckResults -import org.aim42.htmlsanitycheck.html.HtmlElement -import org.aim42.htmlsanitycheck.html.HtmlPage - - -// see end-of-file for license information - -class DuplicateIdChecker extends Checker { - - // the pure Id's as a set (duplicates are already removed here) - // we take this set as basis for our checks! - Set idStringsSet - - // all html-tags containing ids including potential duplicates - List idStringsList - - DuplicateIdChecker(Configuration pConfig) { - super(pConfig) - } - - @Override - protected void initCheckingResultsDescription() { - checkingResults.whatIsChecked = "Duplicate Definition of id Check" - checkingResults.sourceItemName = "id" - checkingResults.targetItemName = "duplicate id" - } - - - @Override - protected SingleCheckResults check( final HtmlPage pageToCheck) { - - //get list of all tagsWithId '<... id="XYZ"...' 
in html file - - idStringsList = pageToCheck.getAllIdStrings() - idStringsSet = idStringsList.toSet() - - checkForDuplicateIds( idStringsSet ) - - return checkingResults - - } - - /* - * iterate over all id's to check for duplicate definitions - */ - private void checkForDuplicateIds( Set idStringsSet ) { - - idStringsSet.each { oneIdString -> - checkForDuplicateDefinition( oneIdString ) - } - - } - - - private void checkForDuplicateDefinition(String idString) { - checkingResults.incNrOfChecks() - - int nrOfOccurrences = idStringsList.findAll{ it == idString}.size() - - // duplicate, IFF idString appears more than once in idStringsList - if (nrOfOccurrences > 1) { - - checkingResults.newFinding( "id \"$idString\" has $nrOfOccurrences definitions." ) - } - } - - - /** - * find all tags with specific id value - * @param id - * @param allTags List of tags containing id-attribute - */ - public static List getAllTagsWithSpecificId( String idString, - List allTags ) { - return allTags.findAll { htmlElement -> - htmlElement.idAttribute == idString - } - } -} - - -/*===================================================================== - Copyright Gernot Starke and aim42 contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an - "AS IS" BASIS,WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, - either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- =====================================================================*/ - diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/ImageMapChecker.groovy b/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/ImageMapChecker.groovy deleted file mode 100644 index 677ed0d7..00000000 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/ImageMapChecker.groovy +++ /dev/null @@ -1,250 +0,0 @@ -package org.aim42.htmlsanitycheck.check - -import org.aim42.htmlsanitycheck.Configuration -import org.aim42.htmlsanitycheck.collect.Finding -import org.aim42.htmlsanitycheck.collect.SingleCheckResults -import org.aim42.htmlsanitycheck.html.HtmlElement -import org.aim42.htmlsanitycheck.html.HtmlPage -import org.aim42.htmlsanitycheck.html.URLUtil - -/** - * principal checks on imageMap usage: - * - 1.) for every usemap-reference there is one map - 2.) every map is referenced by at least one image - 3.) every every map name is unique - 4.) every area-tag has one non-empty href attribute - 5.) 
every href points to valid target (broken-links check) - * - * see also: http://www.w3schools.com/tags/tag_map.asp - **/ -class ImageMapChecker extends Checker { - - private ArrayList maps - private ArrayList mapNames // y with - - private ArrayList imagesWithUsemapRefs - private ArrayList usemapRefs // x with referenced by - - private ArrayList listOfIds - - private String findingText - - private HtmlPage pageToCheck - - ImageMapChecker(Configuration pConfig) { - super(pConfig) - } - - @Override - protected void initCheckingResultsDescription() { - checkingResults.whatIsChecked = "Consistency of ImageMaps" - checkingResults.sourceItemName = "imageMap" - checkingResults.targetItemName = "map/area and usemap-references" - } - - @Override - protected SingleCheckResults check(final HtmlPage pageToCheck) { - - this.pageToCheck = pageToCheck - - readImageMapAttributesFromHtml( ) - - checkBrokenImageMapReferences() - - checkDuplicateMapNames() - - checkDanglingMaps() - - checkEmptyMaps() - - checkForBrokenHrefLinks() // the major check - - return checkingResults - } - - - /* - search for maps that are NOT referenced by any image-tag - */ - private void checkDanglingMaps() { - - mapNames.each { mapName -> - checkingResults.incNrOfChecks() - - // check if mapName is contained in collection of usemap-references - if (!usemapRefs.contains(mapName)) { - findingText = """ImageMap "${mapName}" not referenced by any image.""" - checkingResults.addFinding(new Finding(findingText)) - } - } - } - - - - - /* - search for maps that are NOT referenced by any image-tag - */ - private void checkEmptyMaps() { - ArrayList areas = new ArrayList() - - mapNames.each { mapName -> - areas = pageToCheck.getAllAreasForMapName(mapName) - - checkingResults.incNrOfChecks() - - // empty map? 
- if (areas.size() == 0) { - findingText = """ImageMap "${mapName}" has no area tags.""" - checkingResults.addFinding(new Finding(findingText)) - } - } - } - - /* - check for duplicate map names - */ - private void checkDuplicateMapNames() { - int mapNameCount - - Set mapNameSet = mapNames.toSet() - - mapNameSet.each { mapName -> - mapNameCount = mapNames.count( mapName ) - - checkingResults.incNrOfChecks() - - if (mapNameCount > 1) { - // more than one map with this name - findingText = """${mapNameCount} imagemaps with identical name "${mapName}" exist.""" - checkingResults.addFinding(new Finding(findingText)) - } - } - } - - /* - * ... - * a.) if there is no map named "y" -> problem - * b.) if there are more maps named "y" -> problem - */ - private void checkBrokenImageMapReferences() { - String usemapRef - String imageName - int mapCount - - imagesWithUsemapRefs.each { imageTag -> - usemapRef = imageTag.getUsemapRef() - mapCount = mapNames.findAll{ it == usemapRef }?.size() - - checkingResults.incNrOfChecks() - - if (mapCount == 0) { - // no map found, despite img-tag usemap-reference - imageName = imageTag.getImageSrcAttribute() - findingText = """ImageMap "${usemapRef}" (referenced by image "${imageName}") missing.""" - checkingResults.addFinding( new Finding( findingText )) - } - } - } - - /* - check for broken href links. 
- */ - private void checkForBrokenHrefLinks() { - - mapNames.each { mapName -> - checkAreaHrefsForMapName(mapName) - } - } - - /* - for a specific mapName, check all its contained areaHrefs - */ - private void checkAreaHrefsForMapName( String mapName) { - ArrayList areaHrefs = pageToCheck.getAllHrefsForMapName(mapName) - - // if this List is empty -> the map is empty - // TODO replace checkEmptyMaps with additional check here - areaHrefs.each { href -> - checkingResults.incNrOfChecks() - - // do the actual checking - if (URLUtil.isCrossReference( href )) { - checkLocalHref(href, mapName, areaHrefs) - } - - } - } - - /* - check if href has valid local target - TODO: currently restricted to LOCAL references - TODO: remove duplication to BrokenCrossReferencesChecker - */ - private void checkLocalHref( String href, String mapName,ArrayList areaHrefs ) { - // strip href of its leading "#" - String linkTarget = (href.startsWith("#")) ? href[1..-1] : href - - - if (!listOfIds.contains( linkTarget )) { - - // we found a broken link! - findingText = """ImageMap "${mapName}" refers to missing link \"$linkTarget\"""" - - // now count occurrences - how often is it referenced - int nrOfReferences = areaHrefs.findAll{ it == href }.size() - if (nrOfReferences > 1) { - findingText += ", reference count: $nrOfReferences." - } else findingText += "." - - checkingResults.newFinding(findingText, nrOfReferences) - } - - } - - - /* - set all the interesting attributes - */ - private void readImageMapAttributesFromHtml( ) { - // get all - imagesWithUsemapRefs = pageToCheck.getImagesWithUsemapDeclaration() - - // get all ... 
- maps = pageToCheck.getAllImageMaps() - - // get the names of all maps - mapNames = pageToCheck.getAllMapNames() - - // get all referenced maps from image tags with usemap-attribute - usemapRefs = pageToCheck.getAllUsemapRefs() - - // list of all id="XYZ" - listOfIds = pageToCheck.getAllIdStrings() - - } - - -} - - -/************************************************************************ - * This is free software - without ANY guarantee! - * - * - * Copyright 2015, Dr. Gernot Starke, arc42.org - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - *********************************************************************** */ diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/MissingAltInImageTagsChecker.groovy b/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/MissingAltInImageTagsChecker.groovy deleted file mode 100644 index 9e419055..00000000 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/MissingAltInImageTagsChecker.groovy +++ /dev/null @@ -1,75 +0,0 @@ -package org.aim42.htmlsanitycheck.check - -import org.aim42.htmlsanitycheck.Configuration -import org.aim42.htmlsanitycheck.collect.SingleCheckResults -import org.aim42.htmlsanitycheck.html.HtmlElement -import org.aim42.htmlsanitycheck.html.HtmlPage - -/************************************************************************ - * This is free software - without ANY guarantee! - * - * - * Copyright 2013, Dr. 
Gernot Starke, arc42.org - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - *********************************************************************** */ - -/** - * checks for missing or empty alt-attributes in image tags. - * - */ -class MissingAltInImageTagsChecker extends Checker { - - - MissingAltInImageTagsChecker(Configuration pConfig) { - super(pConfig) - } - - @Override - protected void initCheckingResultsDescription() { - checkingResults.whatIsChecked = "Missing alt-attribute declaration in image tags" - checkingResults.sourceItemName = "image tags" - checkingResults.targetItemName = "missing alt attributes" - } - - - @Override - protected SingleCheckResults check( final HtmlPage pageToCheck ) { - // the number of checks is calculated by counting - // ALL image tags: - checkingResults.setNrOfChecks( pageToCheck.getAllImageTags().size()) - - // see HtmlPageSpec for behavior: missing or empty alt-attributes are included... 
- pageToCheck.getAllImageTagsWithMissingAltAttribute().each { element -> - reportSingleImageTagWithMissingAlt(element) - } - - return checkingResults - } - - /* - - */ - - private void reportSingleImageTagWithMissingAlt(HtmlElement element) { - - String imageName = element.imageSrcAttribute - - String findingText = """image \"$imageName\" is missing alt-attribute""" - - checkingResults.newFinding(findingText) - - } - -} diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/MissingImageFilesChecker.groovy b/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/MissingImageFilesChecker.groovy deleted file mode 100644 index 46d23487..00000000 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/MissingImageFilesChecker.groovy +++ /dev/null @@ -1,146 +0,0 @@ -package org.aim42.htmlsanitycheck.check - -import org.aim42.htmlsanitycheck.Configuration -import org.aim42.htmlsanitycheck.collect.SingleCheckResults -import org.aim42.htmlsanitycheck.html.HtmlElement -import org.aim42.htmlsanitycheck.html.HtmlPage -import org.aim42.htmlsanitycheck.html.URLUtil -import org.slf4j.Logger -import org.slf4j.LoggerFactory - -// see end-of-file for license information - - -class MissingImageFilesChecker extends Checker { - - private List images - private File baseDir - private File currentDir - - // logging stuff - private final static Logger logger = LoggerFactory.getLogger(MissingImageFilesChecker); - - - public MissingImageFilesChecker( Configuration pConfig) { - super( pConfig ) - baseDir = myConfig.getConfigItemByName( Configuration.ITEM_NAME_sourceDir ) - } - - @Override - protected void initCheckingResultsDescription() { - - checkingResults.whatIsChecked = "Missing Local Images Check" - checkingResults.sourceItemName = "img src attributes" - checkingResults.targetItemName = "missing image files" - } - - - @Override - protected SingleCheckResults check(final HtmlPage pageToCheck) { - currentDir = 
pageToCheck.file?.parentFile ?: baseDir - - //get list of all image-tags " - checkSingleLocalImage(image) - } - } - - - private void checkSingleLocalImage(HtmlElement image) { - String imageSrcAttribute = image.getImageSrcAttribute() - - // check only "local" image references - // (that is, NO remote URL) - Boolean isRemoteURL = URLUtil.isRemoteURL(imageSrcAttribute) - Boolean isDataURI = URLUtil.isDataURI(imageSrcAttribute) - if (isRemoteURL) { - //do nothing. This checks for _local_ images - } else if (isDataURI) { - // bookkeeping: - checkingResults.incNrOfChecks() - - doesDataURIContainData( imageSrcAttribute ); - - } else { - //we have a simple local image - - // bookkeeping: - checkingResults.incNrOfChecks() - - doesImageFileExist( imageSrcAttribute ); - } - } - - - - /** - * check if a single image file exists - * - * @param relativePathToImageFile == XYZ in - **/ - private void doesImageFileExist(String relativePathToImageFile) { - File parentDir = relativePathToImageFile?.startsWith("/") ? baseDir : currentDir; - - String decodedRelativePathtoImageFile = URLDecoder.decode(relativePathToImageFile); - - File imageFile = new File(parentDir, decodedRelativePathtoImageFile); - - if (!imageFile.exists() || imageFile.isDirectory()) { - String findingText = "image \"$relativePathToImageFile\" missing" - checkingResults.newFinding(findingText) - } - - } - - /** - * check if the given data-URI contains actual data - * - * Good: "..." 
- * - * Bad: "data:image/jpg;base64," - * - * @param dataURI == XYZ in - **/ - private void doesDataURIContainData(String dataURI) { - // let's do a simple regexp - - if (dataURI ==~ "^data:image/[a-z]+;base64,") { - String findingText = "data-URI image missing" - checkingResults.newFinding(findingText) - } - } - - -} - -/*======================================================================== - Copyright Gernot Starke and aim42 contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an - "AS IS" BASIS,WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, - either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - ========================================================================*/ - - diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/MissingLocalResourcesChecker.groovy b/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/MissingLocalResourcesChecker.groovy deleted file mode 100644 index f3988b8b..00000000 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/MissingLocalResourcesChecker.groovy +++ /dev/null @@ -1,167 +0,0 @@ -package org.aim42.htmlsanitycheck.check - -import org.aim42.htmlsanitycheck.Configuration -import org.aim42.htmlsanitycheck.collect.SingleCheckResults -import org.aim42.htmlsanitycheck.html.HtmlPage -import org.aim42.htmlsanitycheck.html.URLUtil -import org.slf4j.Logger -import org.slf4j.LoggerFactory - -class MissingLocalResourcesChecker extends Checker { - - public static final String MLRC_MESSAGE_PREFIX = "local resource" - public static final String MLRC_MESSAGE_MISSING = "missing" - public static 
final String MLRC_REFCOUNT = ", reference count: " - - // List of the local resources referenced in anchor tags - private List localResourcesList - - // unique local references - every one is unique - // created from the List of all by toSet() method - private Set localResourcesSet - - /** - * The base directory to resolve absolute paths. - */ - private File baseDir - - /** - * The current directory, obtained from the HtmlPage, to resolve - * relative paths. - */ - private File currentDir - - /** - * True to require files to be referenced and not directories. Useful if the web server doesn't - * support a default directory, such as Amazon S3. - */ - private boolean requireFiles = false - - // logging stuff - private final static Logger logger = LoggerFactory.getLogger(MissingLocalResourcesChecker.class); - - public MissingLocalResourcesChecker( Configuration pConfig ) { - super( pConfig ) - baseDir = pConfig.getConfigItemByName( Configuration.ITEM_NAME_sourceDir ) - } - - @Override - protected void initCheckingResultsDescription() { - checkingResults.whatIsChecked = "Missing Local Resources Check" - checkingResults.sourceItemName = "anchor tag href attribute" - checkingResults.targetItemName = "missing local resources" - } - - @Override - protected SingleCheckResults check(final HtmlPage pageToCheck) { - //get list of all anchor-tags containing href="xyz" in html file - List allHrefs = pageToCheck.getAllHrefStrings() - - // now filter out all local resources - localResourcesList = allHrefs.findAll { - URLUtil.isLocalResource( it ) - } - - // filter duplicates by reducing to set - localResourcesSet = localResourcesList.toSet() - - logger.debug """local resources set: ${localResourcesSet}""" - - currentDir = pageToCheck.file?.parentFile ?: baseDir - - // perform the actual checks - checkAllLocalResources( localResourcesSet ) - - return checkingResults - - } - - /* - * iterate over the SET of all local resources - */ - private void checkAllLocalResources( Set 
localResources ) { - localResources.each { localResource -> - checkSingleLocalResource( localResource ) - } - } - - - /* - check a single resource: - - @param localResource can be either: - - file.ext - - dir/file.ext - - file:/dir/file.ext - - file.ext#anchor - - - see #252 (false positives), localResource can be a /example string referencing a file "/example.html" - This special case is called "prefixOnlyHref" - */ - - private void checkSingleLocalResource( String localResource ) { - // the localResource is either path+filename or filename or directory - - logger.debug( "single resource to be checked: + $localResource" ) - - // bookkeeping: - checkingResults.incNrOfChecks() - - // we need to strip the localResource of #anchor-parts - String localResourcePath = new URI( localResource ).getPath() - - if (localResourcePath == null) { - // For example, javascript:; - return - } - - File parentDir = localResourcePath?.startsWith("/") ? baseDir : currentDir; - - // we need the baseDir for robust checking of local resources... - File localFile = new File( parentDir, localResourcePath ); - - // action required if resource does not exist - if (!localFile.exists() || !localFile.isFile()) { - handleNonexistingLocalResource( localResource ) - } - } - - /* - create error message and reference count - */ - private handleNonexistingLocalResource(String nonExistingLocalResource) { - String findingText = """$MLRC_MESSAGE_PREFIX \"${nonExistingLocalResource}\" $MLRC_MESSAGE_MISSING""" - - // how often is localResource referenced? - int nrOfOccurrences = localResourcesList.count(nonExistingLocalResource) - - if (nrOfOccurrences > 1) - findingText += MLRC_REFCOUNT + nrOfOccurrences - - // add Finding to our current checking results, increment nrOfFindings by nrOfOccurrences - checkingResults.newFinding(findingText, nrOfOccurrences) - } - - -} - - -/************************************************************************ - * This is free software - without ANY guarantee! 
- * - * - * Copyright Dr. Gernot Starke and aim42 contributors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - *********************************************************************** */ diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/SuggestingChecker.groovy b/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/SuggestingChecker.groovy deleted file mode 100644 index a1f27e6c..00000000 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/SuggestingChecker.groovy +++ /dev/null @@ -1,95 +0,0 @@ -package org.aim42.htmlsanitycheck.check - -import org.aim42.htmlsanitycheck.Configuration -import org.aim42.htmlsanitycheck.collect.Finding -import org.aim42.htmlsanitycheck.collect.SingleCheckResults -import org.aim42.htmlsanitycheck.html.HtmlPage -import org.aim42.htmlsanitycheck.suggest.Suggester - - -/** - * Abstract class for those @see Checker subclasses that - * can propose suggestions, not only identify errors. - * Example: MissingImagesChecker might suggest names of existing images - * that "could have been meant" - * - */ -abstract class SuggestingChecker extends Checker { - - // valid possibilities for e.g. 
image-file-names or link-targets - ArrayList validPossibilities - - SuggestingChecker(Configuration pConfig) { - super(pConfig) - } - - @Override - abstract protected void initCheckingResultsDescription() - - /** let the instance determine the list of possible values - * Examples: - * - MissingImageFilesChecker -> collect the names of images files - * - BrokenCrossReferencesChecker -> collect all (internal) link targets - **/ - abstract protected void setValidPossibilities() - - - @Override - abstract protected SingleCheckResults check( final HtmlPage pageToCheck) - - - /** - * a little tricky: call performCheck on the superclass and add a little behavior :-) - * it's a Template-Method again. - * @return List of Findings (SingleCheckResults), but with suggestions for each finding - */ - @Override - public final SingleCheckResults performCheck( HtmlPage pageToCheck) { - SingleCheckResults scResults = super.performCheck( pageToCheck ) - - setValidPossibilities() - - determinSuggestionsForEveryFinding() - - return scResults - } - - - /** - * determines suggestions for every Finding agains the list - * of valid possibilities - */ - public void determinSuggestionsForEveryFinding() { - checkingResults.findings.each { finding -> - determineSuggestionsForSingleFinding( finding ) - } - } - - /** - * - */ - public void determineSuggestionsForSingleFinding( Finding finding ) { - finding.setSuggestions( Suggester.determineNSuggestions(finding.whatIsTheProblem, validPossibilities, 1)) - - } -} - -/************************************************************************ - * This is free software - without ANY guarantee! - * - * - * Copyright 2013, Dr. Gernot Starke, arc42.org - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - *********************************************************************** */ diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/UnknownCheckerException.groovy b/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/UnknownCheckerException.groovy deleted file mode 100644 index 349bac73..00000000 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/UnknownCheckerException.groovy +++ /dev/null @@ -1,33 +0,0 @@ -package org.aim42.htmlsanitycheck.check; - -public class UnknownCheckerException extends Exception { - - public UnknownCheckerException(String message) { - super( message); - } - - public UnknownCheckerException(String message, String checkerName) { - super( message + ": " + checkerName ); - } -} - -/************************************************************************ - * This is free software - without ANY guarantee! - *

- * Copyright Dr. Gernot Starke, arc42.org - *

- * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - *

- * http://www.apache.org/licenses/LICENSE-2.0 - *

- * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - *

- * ********************************************************************** - */ - diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/collect/SingleCheckResults.groovy b/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/collect/SingleCheckResults.groovy deleted file mode 100644 index c35d93e0..00000000 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/collect/SingleCheckResults.groovy +++ /dev/null @@ -1,176 +0,0 @@ -// see end-of-file for license information - -package org.aim42.htmlsanitycheck.collect -/** - * collects results for a specific type of @see Checker - * (i.e. missing images, broken cross-references). - * - * @author Gernot Starke - */ - -class SingleCheckResults implements CheckResults { - - String whatIsChecked // i.e. "Missing Local Images Check" - - // source-whatIsTheProblem is checked against target-whatIsTheProblem - String sourceItemName // i.e. image-src-attribute, anchor/link - String targetItemName // i.e. local-image-file, id/bookmark - - String generalRemark // i.e. "Internet not available" - - int nrOfItemsChecked - - private int nrOfIssues - // nrOfIssues can be larger than findings.size(), - // if some findings occur more than once - - // the actual findings - public ArrayList findings - - - /** - * Initialize some members. - * - * Other members are set by the Checker-instance - * owning this SingleCheckResults. - */ - public SingleCheckResults() { - - this.nrOfItemsChecked = 0 - this.nrOfIssues = 0 - this.findings = new ArrayList() - this.generalRemark = "" - } - - - /** - * add a single finding to the collection, - * @param message: what kind of finding is it? - */ - public void newFinding( String message ) { - addFinding( new Finding( message ), 1) - } - - /** - * add a single finding to the collection, - * @param message: what kind of finding is it? - * @param nrOfOccurrences: how often does this occur? 
- */ - public void newFinding( String message, int nrOfOccurrences ) { - addFinding( new Finding( message ), nrOfOccurrences) - } - - - /** - * add a single finding to the collection of Finding instances - * @param singleFinding - */ - public void addFinding(Finding singleFinding) { - findings.add(singleFinding) - incNrOfIssues() - } - - /** - * add single Finding with multiple occurrences - */ - public void addFinding( Finding singleFinding, int nrOfOccurrences ) { - findings.add( singleFinding ) - addNrOfIssues( nrOfOccurrences ) - } - - /** - * bookkeeping on the number of checks - */ - public void incNrOfChecks() { - nrOfItemsChecked += 1 - } - - public void addNrOfChecks( int nrOfChecksToAdd ) { - nrOfItemsChecked += nrOfChecksToAdd - } - - public void setNrOfChecks( int nrOfChecks ) { - nrOfItemsChecked = nrOfChecks - } - - /** - * bookkeeping on the number of issues - */ - public void incNrOfIssues() { - nrOfIssues += 1 - } - - public void addNrOfIssues( int nrOfIssuesToAdd ) { - nrOfIssues += nrOfIssuesToAdd - } - - - /** - * @return a description of what is checked - */ - @Override - public String description() { - return whatIsChecked - } - - - @Override - public ArrayList getFindings() { - return findings - } - - /** - * return a collection of finding-messages - * (used to simplify testing) - */ - public ArrayList getFindingMessages() { - ArrayList messages = new ArrayList() - - findings.each { finding -> - messages.add( finding.whatIsTheProblem ) - } - return messages - } - - - /** - * - * @return ( int ) the nr of issues/findings found for this checkingResults. 
- */ - public int nrOfProblems() { - return nrOfIssues - } - - - - - @Override - public String toString() { - int nrOfProblems = nrOfProblems() - return "Checking results for $whatIsChecked" + '\n' + - " $nrOfItemsChecked $sourceItemName checked," + '\n' + - " $nrOfProblems finding(s)" + '\n' + - findings.each { it.toString() + '\n'} - - } - - -} - -/*===================================================================== - Copyright Gernot Starke and aim42 contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an - "AS IS" BASIS,WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, - either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- =====================================================================*/ - diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/html/HtmlElement.groovy b/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/html/HtmlElement.groovy deleted file mode 100644 index 9620e123..00000000 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/html/HtmlElement.groovy +++ /dev/null @@ -1,107 +0,0 @@ -package org.aim42.htmlsanitycheck.html - -import org.jsoup.nodes.Element - -// see end-of-file for license information - -/** - * Encapsulates a single HTML element with attributes - * Relies on jsoup.select.Element - */ -class HtmlElement { - - private Element element - - public HtmlElement(Element element) { - this.element = element - } - - /** - * @return XYZ for img src="XYZ" tags - */ - public String getImageSrcAttribute() { - if (element.tagName().equals("img")) - element.attr("src") - else return "" - } - - /** - * @return XYZ for XYZ - */ - public String getImageAltAttribute() { - if (element.tagName().equals("img")) - element.attr("alt") - else return "" - } - - /** - * @return XYZ for 'a href="XYZ"' tags - */ - public String getHrefAttribute() { - if (element.tagName().equals("a")) { - return element.attr("href") - } else return "" - } - - /** - * @return XYZ for 'id="XYZ"' attributes - */ - public String getIdAttribute() { - return element.attr("id") - - } - - /** - * @return x for ' - */ - public String getUsemapRef() { - String tmpUsemapRef = "" - - if (element.tagName().equals("img")) { - tmpUsemapRef = HtmlElement.normalizeHrefString( element.attr("usemap") ) - } - return tmpUsemapRef - } - - @Override - public String toString() { - return element.toString() - } - - /* - convert href to string - */ - - private static String normalizeHrefString(String href) { - String normalizedHref - - // local href, starting with "#" (e.g. 
#appendix or #_appendix - if (href.startsWith("#")) { - normalizedHref = href[1..-1] // cut off first letter - } - // empty href might be treated differently one day... - else if (href == "") { - normalizedHref = "" - } else normalizedHref = href - - return normalizedHref - } - -} - -/*======================================================================== - Copyright Gernot Starke and aim42 contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an - "AS IS" BASIS,WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, - either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - ========================================================================*/ diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/html/HtmlPage.groovy b/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/html/HtmlPage.groovy deleted file mode 100644 index d00d13a8..00000000 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/html/HtmlPage.groovy +++ /dev/null @@ -1,325 +0,0 @@ -package org.aim42.htmlsanitycheck.html - -import groovy.transform.Memoized -import org.jsoup.Jsoup -import org.jsoup.nodes.Document -import org.jsoup.select.Elements - -import java.util.regex.Pattern - -// see end-of-file for license information - -/** - * Encapsulates a "real" html parser and provides - * convenience methods to access anchor and image links - * from html. - * - * Relies on http://jsoup.org parser - */ -class HtmlPage { - - /** - * Pattern to check for HTTP/S scheme, includes the - * scheme separator (colon). 
- */ - private static final Pattern HTTP_SCHEME_PATTERN = ~/(?i)^https?:/ - - // jsoup Document - private Document document - - /** - * The HTML file. - */ - private File file - - /** - * - * @param text html as text (string) - * @return an HtmlPage - */ - public HtmlPage(String text) { - // Jsoup promises to parse without exception - - // we believe it, as our wrapper is for checking - // purposes only - document = Jsoup.parse(text, "UTF-8") - } - - /** - * @param file - * @return an HtmlPage - */ - public HtmlPage(File file) { - assert file.exists() - this.file = file - document = Jsoup.parse(file, "UTF-8") - } - - /** - * invokes the parser for the html page - * @param input file - */ - public static HtmlPage parseHtml(File fileToCheck) { - assert fileToCheck.exists() - return new HtmlPage(fileToCheck) - } - - /** - * Gets the file of the HTML page. - * @return the file, or null if the HTML is not from a file. - */ - public File getFile() { - return file; - } - - /** - * get document meta info (e.g. filename, title, size etc.) 
- */ - public int getDocumentSize() { - return document.toString().length() - } - - public String getDocumentTitle() { - return document.title() - } - - public String getDocumentURL() { - return document.nodeName() - } - - public String getDocument() { - return document.toString() - } - - /** - * builds a list of all imageMaps - * @return ArrayList of imageMaps - */ - @Memoized - public final ArrayList getAllImageMaps() { - Elements elements = document?.select("map") - return toHtmlElementsCollection(elements) - } - - /** - * @return list of all imageMap-names - */ - @Memoized - public final ArrayList getAllMapNames() { - ArrayList mapNames = new ArrayList() - - Elements maps = document?.select("map") - - maps.each { map -> - mapNames.add(map.attr("name")) - } - return mapNames - } - - /** - * @return list of all usemap-references y with getAllUsemapRefs() { - ArrayList usemapRefs = new ArrayList() - - getImagesWithUsemapDeclaration().each { image -> - usemapRefs.add(image.getUsemapRef()) - - } - return usemapRefs - } - - /** - * builds a list from all '' tags - * @return immutable ArrayList - */ - @Memoized - public final ArrayList getAllImageTags() { - Elements elements = document?.getElementsByTag("img") - - return toHtmlElementsCollection(elements) - - // alternative: document?.getElementsByTag("img").asList() - } - - /** - * builds an immutable list of 'yz, - * where "yz" is non-empty. - */ - @Memoized - public final ArrayList getAllImageTagsWithNonEmptyAltAttribute() { - // regex "\S" matches any word - Elements elements = document?.select("img[alt~=(\\S)]") - - return toHtmlElementsCollection(elements) - } - - /** - * builds an immutable list of tags, where - * the alt-tag is missing or empty (""). 
- */ - @Memoized - public final ArrayList getAllImageTagsWithMissingAltAttribute() { - Elements elements = document?.select("img") - document?.select("img[alt~=(\\S)]") - - return toHtmlElementsCollection(elements) - } - - /** - * builds a list of all ' tags - * @return ArrayList of all hrefs, including the "#" - */ - @Memoized - public final ArrayList getAllAnchorHrefs() { - Elements elements = document.select("a[href]") - - return toHtmlElementsCollection(elements) - } - - /** - * builds a list of all 'id="XYZ"' attributes - * @return ArrayList of all hrefs - */ - @Memoized - public final ArrayList getAllIds() { - Elements elements = document.getElementsByAttribute("id") - - return toHtmlElementsCollection(elements) - } - - /** - * - * @return ArrayList < String > of all href-attributes - * - * common pitfalls with hrefs: - * - local hrefs start with # (like "#appendix") - * - remote hrefs should be valid URLs (like "https://google.com") - * - remote hrefs might start with other than http (e.g. 
https, mailto, telnet, ssh) - * - hrefs might start with file:// - * - href might be empty string (nobody knows wtf this is good for, but html parsers usually accept it) - */ - @Memoized - public final ArrayList getAllHrefStrings() { - Elements elements = document.select("a[href]") - - ArrayList hrefStrings = new ArrayList<>() - - elements.each { element -> - String href = element.attr("href") - - hrefStrings.add(href) - } - - return hrefStrings - } - - /** - * @return immutable set of all href-attributes that start with http or https - * */ - @Memoized - public final Set getAllHttpHrefStringsAsSet() { - Elements elements = document.select("a[href]") - - return elements - .collect { it.attr("href") } - .findAll { it =~ HTTP_SCHEME_PATTERN } - .toSet() - - } - /** - * @return immutable List of img-tags with "usemap=xyz" declaration - */ - @Memoized - public final ArrayList getImagesWithUsemapDeclaration() { - Elements elements = document?.select("img[usemap]") - - return toHtmlElementsCollection(elements) - } - - /** - * html-map has the following form: - * - * - * collect all area elements for a given map. - * If more than one map exists with this name, areas - * for all maps are combined into one. 
- * @param mapName name of the map - * @return - */ - @Memoized - public final ArrayList getAllAreasForMapName(String mapName) { - // get all maps with name==mapName - Elements mapsWithName = document?.select("map[name=${mapName}]") - - ArrayList areas = new ArrayList() - - mapsWithName.each { map -> - areas += map.children().select("area") - } - return areas - } - - - @Memoized - public final ArrayList getAllHrefsForMapName(String mapName) { - ArrayList hrefs = new ArrayList() - - ArrayList areas = getAllAreasForMapName(mapName) - - areas?.each { area -> - hrefs += area.attr("href") - } - - return hrefs - } - - /** - * getAllIdStrings return ArrayList of all id="xyz" definitions - */ - @Memoized - public final ArrayList getAllIdStrings() { - Elements elements = document.getElementsByAttribute("id") - - ArrayList idList = new ArrayList<>() - - elements.each { element -> - idList.add(element.attr("id")) - } - - return idList - } - - /** - * convert JSoup Elements to ArrayList - */ - @Memoized - private final ArrayList toHtmlElementsCollection(Elements elements) { - - ArrayList arrayList = new ArrayList<>() - - elements.each { element -> - arrayList.add(new HtmlElement(element)) - } - - return arrayList - } - - -} -/*======================================================================== - Copyright Gernot Starke and aim42 contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an - "AS IS" BASIS,WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, - either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- ========================================================================*/ - diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/html/ParserSample.groovy b/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/html/ParserSample.groovy deleted file mode 100644 index ace75351..00000000 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/html/ParserSample.groovy +++ /dev/null @@ -1,96 +0,0 @@ -package org.aim42.htmlsanitycheck.html - -import org.jsoup.Jsoup -import org.jsoup.nodes.Document -import org.jsoup.nodes.Element -import org.jsoup.select.Elements - -// see end-of-file for license information - -/** - * demo-code for the Jsoup html parser - largely taken - * from their website. - * - * Parses the plugin's own readme.html file... - * and reports some links. - * - */ - -class ParserSample { - - public static void main(String[] args) { - final String userDir = System.getProperty("user.dir") - final String fileName = 'README.html' - final String localPath = "/" - final String filePath = userDir + localPath + fileName - final String pathToThisClass = new File(".").getAbsolutePath() - - - println "canonicalPath = $pathToThisClass" - - - final File file = new File(filePath) - print("Fetching %s...", filePath); - - println "file $filePath exists: " + new File(filePath).exists() - Document doc = Jsoup.parse( file, "UTF-8" ); - - HtmlPage page = new HtmlPage( file ) - ArrayList imgs = page.getAllImageTags() - - print "found %d images", imgs.size() - - print "first image" - println imgs.first().getImageSrcAttribute() - - Elements links = doc.select("a[href]"); - Elements media = doc.select("[src]"); - Elements imports = doc.select("link[href]"); - - print("\nMedia: (%d)", media.size()); - for (Element src : media) { - if (src.tagName().equals("img")) - print(" * %s: <%s> %sx%s (%s)", - src.tagName(), src.attr("src"), src.attr("width"), src.attr("height"), - trim(src.attr("alt"), 20)); - else - print(" * %s: <%s>", 
src.tagName(), src.attr("src")); - } - - print("\nImports: (%d)", imports.size()); - for (Element link : imports) { - print(" * %s <%s> (%s)", link.tagName(),link.attr("abs:href"), link.attr("rel")); - } - - // how to get src-attribute from img-tags: - Elements images = doc.getElementsByTag("img") - - print("\n Images: (%d)", images.size() ) - images.each { imageTag -> - print(" * %s <%s> %s", - imageTag.tagName(), // img - imageTag.attributes(), // src="XYZ" - imageTag.attr("src")) // XYZ - } - - println "Elements are of class " + links.getClass() - - print("\nLinks: (%d)", links.size()); - for (Element link : links) { - print(" * a: <%s> (%s)", link.attr("abs:href"), trim(link.text(), 35)); - } - } - - private static void print(String msg, Object... args) { - System.out.println(String.format(msg, args)); - } - - private static String trim(String s, int width) { - if (s.length() > width) - return s.substring(0, width-1) + "."; - else - return s; - } - -} - diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/html/URLUtil.groovy b/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/html/URLUtil.groovy deleted file mode 100644 index 8efdd745..00000000 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/html/URLUtil.groovy +++ /dev/null @@ -1,187 +0,0 @@ -package org.aim42.htmlsanitycheck.html - -import java.util.regex.Matcher -import java.util.regex.Pattern - -/** - * functions to identify categories of string-representations of URLs and URIs, - * e.g. 
isRemote, isCrossReference, isValidIP - */ - -class URLUtil { - - // the foundation for the ip_address_pattern: - // http://stackoverflow.com/questions/5667371/validate-ipv4-address-in-java and - // http://groovy.codehaus.org/Regular+Expressions - protected static - final Pattern ip_address_pattern = ~/(([01]?\d\d?|2[0-4]\d|25[0-5])\.){3}([01]?\d\d?|2[0-4]\d|25[0-5]).*$/ - - /** - * Checks if this String represents a valid URI/URL - * @param link - * @return boolean - */ - public static boolean isValidURL(String link) { - // TODO: refactor this code to use org.apache.commons.validator.routines.* - - boolean isValid = false - - if (isCrossReference(link)) { - return true - } else { - try { - URI aUri = new URL(link).toURI() - isValid = true - } - catch (MalformedURLException e) { - isValid = false - // ignore - - } - catch (URISyntaxException e1) { - isValid = false - } - } - - return isValid - } - - /** - * Checks if this String represents a remote URL - * (startsWith http, https, ftp, telnet...) - * @param link - * */ - - public static boolean isRemoteURL(String link) { - // simple regular expression to match http://, https:// and ftp:// - - return (link ==~ (/^(?i)(https?|ftp|telnet|ssh|ssl|gopher|localhost):\/\/.*$/) - || - - // special case for mailto-links - link ==~ (/^(?i)(mailto):.*$/) - || - - // special case for URLs starting with a valid IP address - ip_address_pattern.matcher(link).matches() - ) - } - - /** - * Checks if this String represents a data-image-URI - * (startsWith "data:image" - * @param s - * */ - - static boolean isDataURI(String s) { - // simple regular expression to match data:image - - return (s ==~ (/^(?i)(data:image).*$/)) - } - - /** - * Checks if this String represents a local resource, either: - * (1) "file://path/filename.ext" or - * (2) is a path, e.g. "directory/filename.ext" or directory or - * (3) starts with //, e.g. 
"index.html" - * - * @see class URLUtilSpec for details - */ - public static boolean isLocalResource(String link) { - - // handle corner cases - if ((link == null) - || containsInvalidChars(link) - || (link == "") - || isCrossReference(link) // "#link" or similar - || isRemoteURL(link) // "mailto:", "http" etc - - ) - return false - - else { - URI aUri = new URI(link) - - return ( - (isLinkToFile(aUri)) // (1) - || - (link ==~ (/^\/\/.*$/)) // (3) - || - (aUri.getPath() != "") // (2) - ) - } - - } - - /** - * helper to identify invalid characters in link - * @param aLink - */ - public static boolean containsInvalidChars(String aLink) { - // TODO finding illegal chars with a regex is overly simple, - // as different chars are allowed in different parts of an URI... - // simple solution works for htmlSanityCheck - - - String illegalCharsRegex = / |\*|\$/ - - Matcher matcher = (aLink =~ illegalCharsRegex) - - // assert matcher instanceof Matcher - - return matcher.find() - } - - /* - ** helper to identify "file scheme" - */ - - private static Boolean isLinkToFile(URI aUri) { - - aUri?.getScheme()?.equalsIgnoreCase("file") - } - - /** - * Checks if this String represents a cross-reference, - * that is an intra-document link - * @param xref - */ - public static boolean isCrossReference(String xref) { - - // the simple test is if the xref starts with "#" - - return (xref.startsWith("#") && !containsInvalidChars(xref)) - - } - - /** - * validate an IP address - * @see URLUtilSpec for details - * @param ipa - the candidate ip address - */ - public static boolean isValidIP(String ipa) { - return ipa ==~ /^(([01]?\d\d?|2[0-4]\d|25[0-5])\.){3}([01]?\d\d?|2[0-4]\d|25[0-5])$/ - } - -} - -/************************************************************************ - * This is free software - without ANY guarantee! - * - * - * Copyright Dr. 
Gernot Starke, arc42.org - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - *********************************************************************** */ - diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/report/CreateLinkUtil.groovy b/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/report/CreateLinkUtil.groovy deleted file mode 100644 index 67ac6183..00000000 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/report/CreateLinkUtil.groovy +++ /dev/null @@ -1,20 +0,0 @@ -package org.aim42.htmlsanitycheck.report - -// see end-of-file for license information - -/** - * trivial class to convert filenames to html link targets. - * E.g. the string "/dir/onefile.html" can be converted - * to "XdirXonefileXhtml" or similar. 
- */ -class CreateLinkUtil { - - public static String convertToLink( String stringWithNonWordChars ) { - - // \W is regex for all non-word characters - def regex = /\W/ - - return stringWithNonWordChars.replaceAll( regex, "X") - } - -} diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/report/Reporter.groovy b/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/report/Reporter.groovy deleted file mode 100644 index b8df1f50..00000000 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/report/Reporter.groovy +++ /dev/null @@ -1,141 +0,0 @@ -package org.aim42.htmlsanitycheck.report - -import org.aim42.htmlsanitycheck.ProductVersion -import org.aim42.htmlsanitycheck.collect.PerRunResults -import org.aim42.htmlsanitycheck.collect.SingleCheckResults -import org.aim42.htmlsanitycheck.collect.SinglePageResults - -/** - * superclass for reporting results. - * Subclasses will define the concrete output format - */ - -abstract class Reporter { - - // TODO: rewrite tests to work with PerRunResults - protected ArrayList pageResults - - // subclasses need to access runResults... - protected PerRunResults runResults - - protected final String createdOnDate - protected final String createdByHSCVersion - - - /** - * create the reporter - */ - public Reporter() { - this.createdOnDate = new Date().format('dd. MMMM YYYY, HH:mm') - this.createdByHSCVersion = ProductVersion.version - - } - - - - /** - * Usually a Reporter instance shall be constructed with its appropriate - * @see PerRunResults, as the latter contains all findings. 
- * @param runResults - */ - public Reporter( PerRunResults runResults ) { - this() - this.runResults = runResults - this.pageResults = runResults.getResultsForAllPages() - - } - - - /** - * add checking results for one page - */ - public addCheckingResultsForOnePage( SinglePageResults singlePageResults) { - pageResults.add( singlePageResults ) - pageResults.sort(); // enforce sorting, fixing issue #128 - } - - /** - * main entry point for reporting - to be called when a report is requested - * - * Uses template-method to delegate most concrete implementations to subclasses - */ - public void reportFindings() { - - initReport() - - reportOverallSummary() - - reportAllPages() - - closeReport() - } - - private void reportAllPages() { - pageResults.each { pageResult -> - reportPageSummary( pageResult ) // delegated to subclass - reportPageDetails( pageResult ) // implemented below - reportPageFooter() // delegated to subclass - } - } - - protected void reportPageDetails( SinglePageResults pageResults ) { - pageResults.singleCheckResults.each { resultForOneCheck -> - reportSingleCheckSummary( resultForOneCheck ) - reportSingleCheckDetails( resultForOneCheck ) - } - } - - - protected int totalNrOfPages() { - return pageResults.size() - } - - protected int totalNrOfChecks() { - return runResults.nrOfChecksPerformedOnAllPages() - } - - protected int totalNrOfFindings() { - return runResults.nrOfFindingsOnAllPages() - } - - // delegate *real* work to subclasses - - // needs to e.g. 
open files or streams - protected void initReport() { - // default: do nothing - } - - abstract protected void reportOverallSummary() - - abstract protected void reportPageSummary( SinglePageResults pageResult ) - abstract protected void reportPageFooter( ) - - abstract protected void reportSingleCheckSummary( SingleCheckResults singleCheckResults ) - abstract protected void reportSingleCheckDetails( SingleCheckResults singleCheckResults ) - - protected void closeReport() { - // default: do nothing - } - - -} - -/************************************************************************ - * This is free software - without ANY guarantee! - * - * - * Copyright, Dr. Gernot Starke and aim42.org committers - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - *********************************************************************** */ diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/report/SummarizerUtil.groovy b/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/report/SummarizerUtil.groovy deleted file mode 100644 index 50dc5637..00000000 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/report/SummarizerUtil.groovy +++ /dev/null @@ -1,59 +0,0 @@ -package org.aim42.htmlsanitycheck.report - -class SummarizerUtil { - - /** - * returns the percentage of successful checks. 
- * - * Edge case: - * 0 checks -> 100% successful - * - */ - public static int percentSuccessful(int totalChecks, int totalNrOfFindings) { - - - // base case: if no checks performed, 100% successful - if (totalChecks <= 0) { - return 100 - } - // at least one check was performed, calculate percentage - else { - return 100 - (100 * totalNrOfFindings) / totalChecks - } - } - - /** - * rounds one down to at most 3 digits with two decimalplaces, - * e.g. from - * 33450 to 33.45, from 1_234_566 to 1.23 - */ - public static float threeDigitTwoDecimalPlaces( int bigNumber ) { - - - if (bigNumber >= 1_000_000) - return Math.round( bigNumber.div(10_000)).div(100) - else return Math.round( bigNumber.div(10)).div(100) - - } -} - - -/************************************************************************ - * This is free software - without ANY guarantee! - * - * - * Copyright 2013, Dr. Gernot Starke, arc42.org - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - *********************************************************************** */ diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/inet/NetUtil.groovy b/htmlSanityCheck-core/src/main/groovy/org/aim42/inet/NetUtil.groovy deleted file mode 100644 index fc6dbb69..00000000 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/inet/NetUtil.groovy +++ /dev/null @@ -1,42 +0,0 @@ -package org.aim42.inet - -// see end-of-file for license information - - -class NetUtil { - - // the codes below can be overwritten by configuration! - - // these are regarded as "Success" when checking - // http(s) links - final static def HTTP_SUCCESS_CODES = (200..208) + [226] - - // for codes in the HTTP_WARNING_CODES, a warning is added to the findings - final static def HTTP_WARNING_CODES = (100..102) + (300..308) - - // error codes - final static def HTTP_ERROR_CODES = (400..451) + (500..511) - - // typical extensions used with "prefixOnlyHrefs" (e.g. ) - final static def POSSIBLE_EXTENSIONS = ["html", "htm", "shtml", "phtml", "php","asp", "aspx", "xml"] - - /** - * We try to check if there is a usable Internet connection available. - * Our approximation is DNS resolution: if google.com can be resolved to an IP address, - * there should be an active and usable internet connection available. - * - * @return true if Internet is (seemingly available - */ - static boolean isInternetConnectionAvailable() { - - try { - // if we can get google's address, there is Internet... 
- InetAddress.getByName("google.com"); - return true - } catch (UnknownHostException e) { - // we cannot resolve google, there might be no internet connection - return false - } - } -} - diff --git a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/AscendingSimilarityScoreComparator.java b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/AscendingSimilarityScoreComparator.java index 52d08d3d..42d7c5e4 100644 --- a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/AscendingSimilarityScoreComparator.java +++ b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/AscendingSimilarityScoreComparator.java @@ -7,10 +7,10 @@ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: - * + * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -28,28 +28,28 @@ /** * A comparator that allows SimilarityScore to be sorted in * ascending order. - * @author Ralph Allan Rice ralph.rice@gmail.com * + * @author Ralph Allan Rice ralph.rice@gmail.com */ -public class AscendingSimilarityScoreComparator implements Comparator -{ - /** - * Compares two similarity scores. - * @param x The first score to be compared. - * @param y The second score to be compared. - * @return a negative integer, zero, or a positive integer as the first score is less than, - * equal to, or greater than the second score. 
- */ - public int compare(SimilarityScore x, SimilarityScore y) { - double first = x.getScore(); - double second = y.getScore(); - if (first == second) { - return 0; - } - if (first < second) { - return -1; - } - return 1; - } - +public class AscendingSimilarityScoreComparator implements Comparator { + /** + * Compares two similarity scores. + * + * @param x The first score to be compared. + * @param y The second score to be compared. + * @return a negative integer, zero, or a positive integer as the first score is less than, + * equal to, or greater than the second score. + */ + public int compare(SimilarityScore x, SimilarityScore y) { + double first = x.getScore(); + double second = y.getScore(); + if (first == second) { + return 0; + } + if (first < second) { + return -1; + } + return 1; + } + } diff --git a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/DescendingSimilarityScoreComparator.java b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/DescendingSimilarityScoreComparator.java index a5124701..45a3af8c 100644 --- a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/DescendingSimilarityScoreComparator.java +++ b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/DescendingSimilarityScoreComparator.java @@ -7,10 +7,10 @@ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: - * + * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -28,28 +28,29 @@ /** * A comparator that allows SimilarityScore to be sorted in * descending order. 
- * @author Ralph Allan Rice ralph.rice@gmail.com * + * @author Ralph Allan Rice ralph.rice@gmail.com */ -public class DescendingSimilarityScoreComparator implements Comparator -{ - /** - * Compares two similarity scores. - * @param x The first score to be compared. - * @param y The second score to be compared. - * @return a negative integer, zero, or a positive integer as the first score is greater than, - * equal to, or less than the second score. - */public int compare(SimilarityScore x, SimilarityScore y) { - double first = x.getScore(); - double second = y.getScore(); - if (first == second) { - return 0; - } - if (first < second) { - return 1; - } - return -1; - } - +public class DescendingSimilarityScoreComparator implements Comparator { + /** + * Compares two similarity scores. + * + * @param x The first score to be compared. + * @param y The second score to be compared. + * @return a negative integer, zero, or a positive integer as the first score is greater than, + * equal to, or less than the second score. + */ + public int compare(SimilarityScore x, SimilarityScore y) { + double first = x.getScore(); + double second = y.getScore(); + if (first == second) { + return 0; + } + if (first < second) { + return 1; + } + return -1; + } + } diff --git a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/JaroStrategy.java b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/JaroStrategy.java index 34d5bd26..a71a73f8 100644 --- a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/JaroStrategy.java +++ b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/JaroStrategy.java @@ -7,10 +7,10 @@ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: - * + * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
- * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -25,31 +25,29 @@ /** * A strategy that uses the Jaro Distance to calculate the similarity of two strings. + * * @author Ralph Allan Rice ralph.rice@gmail.com * @see About Jaro Distance */ public class JaroStrategy implements SimilarityStrategy { - /** - * Calculates the similarity score of objects, where 0.0 implies absolutely no similarity - * and 1.0 implies absolute similarity. - * - * @param first The first string to compare. - * @param second The second string to compare. - * @return A number between 0.0 and 1.0. - */ + /** + * Calculates the similarity score of objects, where 0.0 implies absolutely no similarity + * and 1.0 implies absolute similarity. + * + * @param first The first string to compare. + * @param second The second string to compare. + * @return A number between 0.0 and 1.0. + */ public double score(String first, String second) { - String shorter; + String shorter; String longer; // Determine which String is longer. - if (first.length() > second.length()) - { + if (first.length() > second.length()) { longer = first.toLowerCase(); shorter = second.toLowerCase(); - } - else - { + } else { longer = second.toLowerCase(); shorter = first.toLowerCase(); } @@ -77,61 +75,55 @@ public double score(String first, String second) { // Calculate the distance. double dist = - (m1.length() / ((double)shorter.length()) + - m2.length() / ((double)longer.length()) + - (m1.length() - transpositions) / ((double)m1.length())) / 3.0; + (m1.length() / ((double) shorter.length()) + + m2.length() / ((double) longer.length()) + + (m1.length() - transpositions) / ((double) m1.length())) / 3.0; return dist; - - } - - /** - * Gets a set of matching characters between two strings. - * - * @param first The first string. 
- * @param second The second string. - * @param limit The maximum distance to consider. - * @return A string contain the set of common characters. - * @remarks Two characters from the first string and the second string are considered matching if the character's + + } + + /** + * Gets a set of matching characters between two strings. + * + * @param first The first string. + * @param second The second string. + * @param limit The maximum distance to consider. + * @return A string contain the set of common characters. + * @remarks Two characters from the first string and the second string are considered matching if the character's * respective positions are no farther than the limit value. - */ - private String getSetOfMatchingCharacterWithin(String first, String second, int limit) - { + */ + private String getSetOfMatchingCharacterWithin(String first, String second, int limit) { StringBuilder common = new StringBuilder(); StringBuilder copy = new StringBuilder(second); - for (int i = 0; i < first.length(); i++) - { + for (int i = 0; i < first.length(); i++) { char ch = first.charAt(i); boolean found = false; // See if the character is within the limit positions away from the original position of that character. - for (int j = Math.max(0, i - limit); !found && j < Math.min(i + limit, second.length()); j++) - { - if (copy.charAt(j) == ch) - { + for (int j = Math.max(0, i - limit); !found && j < Math.min(i + limit, second.length()); j++) { + if (copy.charAt(j) == ch) { found = true; common.append(ch); - copy.setCharAt(j,'*'); + copy.setCharAt(j, '*'); } } } return common.toString(); } - /** - * Calculates the number of transpositions between two strings. - * @param first The first string. - * @param second The second string. - * @return The number of transpositions between the two strings. - */ - private int transpositions(String first, String second) - { + /** + * Calculates the number of transpositions between two strings. + * + * @param first The first string. 
+ * @param second The second string. + * @return The number of transpositions between the two strings. + */ + private int transpositions(String first, String second) { int transpositions = 0; - for (int i = 0; i < first.length(); i++) - { - if (first.charAt(i) != second.charAt(i)) - { + for (int i = 0; i < first.length(); i++) { + if (first.charAt(i) != second.charAt(i)) { transpositions++; } } diff --git a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/JaroWinklerStrategy.java b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/JaroWinklerStrategy.java index 7f3baa7e..7b34136a 100644 --- a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/JaroWinklerStrategy.java +++ b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/JaroWinklerStrategy.java @@ -7,10 +7,10 @@ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: - * + * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -25,24 +25,23 @@ /** * A strategy that uses the Jaro-Winkler Distance to calculate the similarity of two strings. - * + * * @author Ralph Allan Rice ralph.rice@gmail.com * @see About Jaro-Winkler Distance */ public class JaroWinklerStrategy extends JaroStrategy implements SimilarityStrategy { - final double DEFAULT_SCALING_FACTOR = 0.1; // This is the default scaling factor Winkler used. + final double DEFAULT_SCALING_FACTOR = 0.1; // This is the default scaling factor Winkler used. 
+ + private final double scalingFactor; - private double scalingFactor; - /** * Constructs a new JaroWinklerStrategy instance. + * * @param scalingFactor The scaling factor between 0.00 and 0.25. If the scaling factor is greater than 0.25, the scaling factor is set to 0.25. */ - public JaroWinklerStrategy(double scalingFactor) - { - if (scalingFactor > 0.25) - { - scalingFactor = 0.25; + public JaroWinklerStrategy(double scalingFactor) { + if (scalingFactor > 0.25) { + scalingFactor = 0.25; } this.scalingFactor = scalingFactor; } @@ -50,21 +49,19 @@ public JaroWinklerStrategy(double scalingFactor) /** * Constructs a new JaroWinklerStrategy instance. */ - public JaroWinklerStrategy() - { + public JaroWinklerStrategy() { this.scalingFactor = DEFAULT_SCALING_FACTOR; } /** - * Calculates the similarity score of objects, where 0.0 implies absolutely no similarity - * and 1.0 implies absolute similarity. - * - * @param first The first string to compare. - * @param second The second string to compare. - * @return A number between 0.0 and 1.0. - */ - public double score(String first, String second) - { + * Calculates the similarity score of objects, where 0.0 implies absolutely no similarity + * and 1.0 implies absolute similarity. + * + * @param first The first string to compare. + * @param second The second string to compare. + * @return A number between 0.0 and 1.0. + */ + public double score(String first, String second) { double jaro = super.score(first, second); int cl = commonPrefixLength(first, second); @@ -78,25 +75,22 @@ public double score(String first, String second) } /** - * Calculates the number of characters from the beginning of the strings that match exactly one-to-one, + * Calculates the number of characters from the beginning of the strings that match exactly one-to-one, * up to a maximum of four (4) characters. - * @param first The first string. + * + * @param first The first string. * @param second The second string. * @return A number between 0 and 4. 
*/ - private int commonPrefixLength(String first, String second) - { + private int commonPrefixLength(String first, String second) { String shorter; String longer; // Determine which string is longer. - if (first.length() > second.length()) - { + if (first.length() > second.length()) { longer = first.toLowerCase(); shorter = second.toLowerCase(); - } - else - { + } else { longer = second.toLowerCase(); shorter = first.toLowerCase(); } @@ -104,18 +98,16 @@ private int commonPrefixLength(String first, String second) int result = 0; // Iterate through the shorter string. - for (int i = 0; i < shorter.length(); i++) - { - if (shorter.charAt(i) != longer.charAt(i)) - { + for (int i = 0; i < shorter.length(); i++) { + if (shorter.charAt(i) != longer.charAt(i)) { break; } result++; } // Limit the result to 4. - return result > 4? 4: result; + return result > 4 ? 4 : result; } - + } diff --git a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/SimilarityScore.java b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/SimilarityScore.java index c0e2c707..7c55af7d 100644 --- a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/SimilarityScore.java +++ b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/SimilarityScore.java @@ -7,10 +7,10 @@ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: - * + * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -25,67 +25,72 @@ /** * A value object contains a similarity score. 
+ * * @author Ralph Allan Rice ralph.rice@gmail.com - * */ public class SimilarityScore { - - private String key; - private double score; - - /** - * Constructs a similarity score. - * @param key The string key. - * @param score The score value. - */ - public SimilarityScore(String key, double score) { - this.key = key; - this.score = score; - } - - /** - * Gets the key for this score. - * @return A string. - */ - public String getKey() { - return this.key; - } - - /** - * Gets the value of the score. - * @return A double. - */ - public double getScore() { - return this.score; - } + private final String key; + private final double score; + + /** + * Constructs a similarity score. + * + * @param key The string key. + * @param score The score value. + */ + + public SimilarityScore(String key, double score) { + this.key = key; + this.score = score; + } + + /** + * Gets the key for this score. + * + * @return A string. + */ + public String getKey() { + return this.key; + } + + /** + * Gets the value of the score. + * + * @return A double. + */ + public double getScore() { + return this.score; + } + + + /** + * Returns the hash code for this object. + * + * @return An integer representing the hash code. + */ + public int hashCode() { + int hash = 11; + hash = 23 * hash + key.hashCode(); + hash = 23 * hash + (int) (score * 1000000); + return hash; + } + + /** + * Determines if the supplied object equals this object. + * + * @return True if the keys and scores match between the two objects. Otherwise false. + */ + @Override + public boolean equals(Object o) { + if ((o == null) || (o.getClass() != this.getClass())) { + return false; + } + SimilarityScore other = (SimilarityScore) o; + + return this.key.equals(other.key) + && this.score == other.score; + } + - - /** - * Returns the hash code for this object. - * @return An integer representing the hash code. 
- */ - public int hashCode() { - int hash = 11; - hash = 23 * hash + key.hashCode(); - hash = 23 * hash + (int)(score * 1000000); - return hash; - } - - /** - * Determines if the supplied object equals this object. - * @return True if the keys and scores match between the two objects. Otherwise false. - */ - @Override - public boolean equals(Object o) { - if((o == null) || (o.getClass() != this.getClass())) { - return false; - } - SimilarityScore other=(SimilarityScore)o; - - return this.key.equals(other.key) - && this.score == other.score; - } - - } diff --git a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/SimilarityStrategy.java b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/SimilarityStrategy.java index afba0a8c..7a34d565 100644 --- a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/SimilarityStrategy.java +++ b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/SimilarityStrategy.java @@ -7,10 +7,10 @@ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: - * + * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -29,13 +29,13 @@ */ public interface SimilarityStrategy { - /** - * Calculates the similarity score of objects, where 0.0 implies absolutely no similarity - * and 1.0 implies absolute similarity. - * - * @param first The first string to compare. - * @param second The second string to compare. - * @return A number between 0.0 and 1.0. 
- */ + /** + * Calculates the similarity score of objects, where 0.0 implies absolutely no similarity + * and 1.0 implies absolute similarity. + * + * @param first The first string to compare. + * @param second The second string to compare. + * @return A number between 0.0 and 1.0. + */ double score(String first, String second); } diff --git a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/StringSimilarityService.java b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/StringSimilarityService.java index 90765cd5..beefe14f 100644 --- a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/StringSimilarityService.java +++ b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/StringSimilarityService.java @@ -7,10 +7,10 @@ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: - * + * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -28,55 +28,61 @@ /** * A service that performs string similarity calculations. - * @author Ralph Allan Rice ralph.rice@gmail.com * + * @author Ralph Allan Rice ralph.rice@gmail.com */ public interface StringSimilarityService { - /** + /** * Calculates all similarity scores for a given set of features. + * * @param features The list of features. - * @param target The target string to compare against the features. + * @param target The target string to compare against the features. * @return A list of similarity scores. */ List scoreAll(List features, String target); - + /** * Calculates the similarity score of a single feature. 
+ * * @param feature The feature string to compare. - * @param target The target string to compare against the feature. + * @param target The target string to compare against the feature. * @return The similarity score between the feature and target. */ double score(String feature, String target); - + /** * Finds the feature within a set of given features that best match the target string. + * * @param features A list of strings containing the features to compare. - * @param target The target string to compare against the features. + * @param target The target string to compare against the features. * @return A SimilarityScore that has the highest score value amongst the features. */ SimilarityScore findTop(List features, String target); /** * Finds the feature within a set of given features that best match the target string. - * @param features A list of strings containing the features to compare. - * @param target The target string to compare against the features. - * @param comparator A comparator that is used sort the scores. + * + * @param features A list of strings containing the features to compare. + * @param target The target string to compare against the features. + * @param comparator A comparator that is used sort the scores. * @return A SimilarityScore that has the top value amongst the features, according to the comparator. */ SimilarityScore findTop(List features, String target, Comparator comparator); // added by Gernot Starke: + /** * Finds the n features within a set of given features that best match the target string. + * * @param features A list of strings containing the features to compare. - * @param target The target string to compare against the features. - * @param n The (maximum) number of hits to be returned. + * @param target The target string to compare against the features. + * @param n The (maximum) number of hits to be returned. 
* @return A list of SimilarityScore instances having the top values amongst the features, - * according to the comparator + * according to the comparator */ - List findBestN( List features, String target, int n ); + List findBestN(List features, String target, int n); } diff --git a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/StringSimilarityServiceImpl.java b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/StringSimilarityServiceImpl.java index 08d0ee0e..d47ec891 100644 --- a/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/StringSimilarityServiceImpl.java +++ b/htmlSanityCheck-core/src/main/java/net/ricecode/similarity/StringSimilarityServiceImpl.java @@ -7,10 +7,10 @@ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: - * + * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -31,16 +31,18 @@ /** * An implementation of StringSimilarityService. + * * @author Ralph Allan Rice ralph.rice@gmail.com * @see StringSimilarityService */ public class StringSimilarityServiceImpl implements StringSimilarityService { - private final SimilarityStrategy strategy; + private final SimilarityStrategy strategy; /** * Creates a similarity calculator instance. + * * @param strategy The similarity strategy to use when calculating similarity scores. */ public StringSimilarityServiceImpl(SimilarityStrategy strategy) { @@ -49,72 +51,74 @@ public StringSimilarityServiceImpl(SimilarityStrategy strategy) { /** * Calculates all similarity scores for a given set of features. 
+ * * @param features The list of features. - * @param target The target string to compare against the features. + * @param target The target string to compare against the features. * @return A list of similarity scores. */ - public List scoreAll(List features, String target) - { + public List scoreAll(List features, String target) { ArrayList scores = new ArrayList(); - - for(String feature: features) { - double score = strategy.score(feature, target); - scores.add(new SimilarityScore(feature, score)); + + for (String feature : features) { + double score = strategy.score(feature, target); + scores.add(new SimilarityScore(feature, score)); } - + return scores; } /** * Calculates the similarity score of a single feature. + * * @param feature The feature string to compare. - * @param target The target string to compare against the feature. + * @param target The target string to compare against the feature. * @return The similarity score between the feature and target. */ - public double score(String feature, String target) - { + public double score(String feature, String target) { return strategy.score(feature, target); } /** * Finds the feature within a set of given features that best match the target string. + * * @param features A list of strings containing the features to compare. - * @param target The target string to compare against the features. + * @param target The target string to compare against the features. * @return The similarity score with the highest value. */ - public SimilarityScore findTop(List features, String target) - { - return findTop(features, target, new DescendingSimilarityScoreComparator()); + public SimilarityScore findTop(List features, String target) { + return findTop(features, target, new DescendingSimilarityScoreComparator()); } - + /** * Finds the feature within a set of given features that best match the target string. - * @param features A list of strings containing the features to compare. 
- * @param target The target string to compare against the features. - * @param comparator A comparator that is used sort the scores. + * + * @param features A list of strings containing the features to compare. + * @param target The target string to compare against the features. + * @param comparator A comparator that is used sort the scores. * @return A SimilarityScore that has the top value amongst the features, according to the comparator. */ - public SimilarityScore findTop(List features, String target, Comparator comparator) - { - if (features.size() == 0) { - return null; - } - List scores= scoreAll(features, target); - Collections.sort(scores, comparator); - return scores.get(0); + public SimilarityScore findTop(List features, String target, Comparator comparator) { + if (features.size() == 0) { + return null; + } + List scores = scoreAll(features, target); + Collections.sort(scores, comparator); + return scores.get(0); } // added by Gernot Starke: + /** * Finds the n features within a set of given features that best match the target string. + * * @param features A list of strings containing the features to compare. - * @param target The target string to compare against the features. - * @param n The (maximum) number of hits to be returned. + * @param target The target string to compare against the features. + * @param n The (maximum) number of hits to be returned. 
* @return A list of SimilarityScore instances having the top values amongst the features, - * according to the comparator + * according to the comparator */ - public List findBestN( List features, String target, int n) { + public List findBestN(List features, String target, int n) { List result = new ArrayList(); if ((features.size() > 0) && (n >= 1)) { diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/Configuration.java b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/Configuration.java new file mode 100644 index 00000000..cc81f81f --- /dev/null +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/Configuration.java @@ -0,0 +1,319 @@ +package org.aim42.htmlsanitycheck; + +import org.aim42.htmlsanitycheck.check.AllCheckers; +import org.aim42.htmlsanitycheck.tools.Web; + +import java.io.File; +import java.util.Collection; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Set; + +/** + * Handles (and can verify) configuration options. + *

+ * Implemented as REGISTRY pattern + *

+ *

+ * Explanation for configuring http status codes: + * The standard http status codes are defined in class {@link Web} and can + * be overwritten by configuration: + *

+ * Example: You want 503 to be ok instead of error: + * httpSuccessCodes = [503] + *

+ * During configuration initialization, the value(s) of httpSuccessCodes will be: + * 1.) set-added to httpSuccessCodes, + * 2.) set-subtracted from the warnings and errors. + *

+ *

+ * This class needs to be updated if additional configuration options are added. + *

+ *

+ * Ideas for additional config options: + * ------------------------------------ + * - verbosity level on console during checks + */ +public class Configuration { + /***************************************** + * configuration item names + * + * NEVER use any string constants for configuration + * item names within source code! + ****************************************/ + private static final String ITEM_NAME_sourceDocuments = "sourceDocuments"; + private static final String ITEM_NAME_sourceDir = "sourceDir"; + private static final String ITEM_NAME_checkingResultsDir = "checkingResultsDir"; + private static final String ITEM_NAME_junitResultsDir = "junitResultsDir"; + private static final String ITEM_NAME_consoleReport = "consoleReport"; + private static final String ITEM_NAME_failOnErrors = "failOnErrors"; + private static final String ITEM_NAME_httpConnectionTimeout = "httpConnectionTimeout"; + private static final String ITEM_NAME_ignoreLocalhost = "ignoreLocalHost"; + private static final String ITEM_NAME_ignoreIPAddresses = "ignoreIPAddresses"; + private static final String ITEM_NAME_httpWarningCodes = "httpWarningCodes"; + private static final String ITEM_NAME_httpErrorCodes = "httpErrorCodes"; + private static final String ITEM_NAME_httpSuccessCodes = "httpSuccessCodes"; + private static final String ITEM_NAME_urlsToExclude = "urlsToExclude"; + private static final String ITEM_NAME_hostsToExclude = "hostsToExclude"; + private static final String ITEM_NAME_prefixOnlyHrefExtensions = "prefixOnlyHrefExtensions"; + private static final String ITEM_NAME_checksToExecute = "checksToExecute"; + /*************************** + * private member + **************************/ + private final Map configurationItems = new LinkedHashMap(); + + public Configuration() { + + this.configurationItems.put(ITEM_NAME_httpErrorCodes, Web.HTTP_ERROR_CODES); + this.configurationItems.put(ITEM_NAME_httpSuccessCodes, Web.HTTP_SUCCESS_CODES); + 
this.configurationItems.put(ITEM_NAME_httpWarningCodes, Web.HTTP_WARNING_CODES); + + this.configurationItems.put(ITEM_NAME_httpConnectionTimeout, 5000);// 5 secs as default timeout + this.configurationItems.put(ITEM_NAME_ignoreIPAddresses, false);// warning if numerical IP addresses + this.configurationItems.put(ITEM_NAME_ignoreLocalhost, false);// warning if localhost-URLs + + this.configurationItems.put(ITEM_NAME_prefixOnlyHrefExtensions, Web.POSSIBLE_EXTENSIONS); + + this.configurationItems.put(ITEM_NAME_checksToExecute, AllCheckers.checkerClazzes); + } + + public static String getITEM_NAME_sourceDocuments() { + return ITEM_NAME_sourceDocuments; + } + + public static String getITEM_NAME_sourceDir() { + return ITEM_NAME_sourceDir; + } + + public static String getITEM_NAME_checkingResultsDir() { + return ITEM_NAME_checkingResultsDir; + } + + public static String getITEM_NAME_junitResultsDir() { + return ITEM_NAME_junitResultsDir; + } + + public static String getITEM_NAME_consoleReport() { + return ITEM_NAME_consoleReport; + } + + public static String getITEM_NAME_failOnErrors() { + return ITEM_NAME_failOnErrors; + } + + public static String getITEM_NAME_httpConnectionTimeout() { + return ITEM_NAME_httpConnectionTimeout; + } + + public static String getITEM_NAME_ignoreLocalhost() { + return ITEM_NAME_ignoreLocalhost; + } + + public static String getITEM_NAME_ignoreIPAddresses() { + return ITEM_NAME_ignoreIPAddresses; + } + + public static String getITEM_NAME_httpWarningCodes() { + return ITEM_NAME_httpWarningCodes; + } + + public static String getITEM_NAME_httpErrorCodes() { + return ITEM_NAME_httpErrorCodes; + } + + public static String getITEM_NAME_httpSuccessCodes() { + return ITEM_NAME_httpSuccessCodes; + } + + public static String getITEM_NAME_urlsToExclude() { + return ITEM_NAME_urlsToExclude; + } + + public static String getITEM_NAME_hostsToExclude() { + return ITEM_NAME_hostsToExclude; + } + + public static String getITEM_NAME_prefixOnlyHrefExtensions() { + 
return ITEM_NAME_prefixOnlyHrefExtensions; + } + + public static String getITEM_NAME_checksToExecute() { + return ITEM_NAME_checksToExecute; + } + + /** + * retrieve a single configuration item + * + * @param itemName + * @return + */ + public synchronized Object getConfigItemByName(final String itemName) { + return configurationItems.get(itemName); + } + + public Set getConfigItemByNameSetOfIntegers(final String itemName) { + Object rawConfig = getConfigItemByName(itemName); + if (rawConfig instanceof Set ) { + return (Set) rawConfig; + } + throw new IllegalArgumentException("The Configuration property \"" + itemName + "\" should be a set of integers"); + } + + /** + * convenience method for simplified testing + */ + public synchronized void addSourceFileConfiguration(File srcDir, Collection srcDocs) { + addConfigurationItem(ITEM_NAME_sourceDir, srcDir); + addConfigurationItem(ITEM_NAME_sourceDocuments, srcDocs); + } + + /** + * @return true if item is already present, false otherwise + */ + public boolean checkIfItemPresent(String itemName) { + return configurationItems.get(itemName) != null; + + } + + /** + * @return the number of configuration items + */ + public int nrOfConfigurationItems() { + return configurationItems.size(); + } + + /** + * add a single configuration item, unless its value is null + * + * @param itemName + * @param itemValue + */ + public void addConfigurationItem(String itemName, Object itemValue) { + if (itemValue != null) { + configurationItems.put(itemName, itemValue); + } + + } + + /** + * overwrites httpSuccessCodes configuration + */ + public void overwriteHttpSuccessCodes(Collection additionalSuccessCodes) { + final Set errCodes = (Set) getConfigItemByName(Configuration.getITEM_NAME_httpErrorCodes()); + final Set warnCodes = (Set) getConfigItemByName(Configuration.getITEM_NAME_httpWarningCodes()); + final Set successCodes = (Set) getConfigItemByName(Configuration.getITEM_NAME_httpSuccessCodes()); + + 
additionalSuccessCodes.forEach(co -> { + successCodes.add(co); + errCodes.remove(co); + warnCodes.remove(co); + } + ); + + updateSuccessWarningErrorCodesConfiguration(errCodes, warnCodes, successCodes); + } + + /** + * overwrites httpWarningCodes configuration + */ + public void overwriteHttpWarningCodes(Collection additionalWarningCodes) { + final Set errCodes = (Set) getConfigItemByName(Configuration.getITEM_NAME_httpErrorCodes()); + final Set warnCodes = (Set) getConfigItemByName(Configuration.getITEM_NAME_httpWarningCodes()); + final Set successCodes = (Set) getConfigItemByName(Configuration.getITEM_NAME_httpSuccessCodes()); + + additionalWarningCodes.forEach(co -> { + successCodes.add(co); + errCodes.remove(co); + warnCodes.remove(co); + } + ); + + updateSuccessWarningErrorCodesConfiguration(errCodes, warnCodes, successCodes); + } + + /** + * overwrites httpErrorCodes configuration + */ + public void overwriteHttpErrorCodes(Collection additionalErrorCodes) { + final Set errCodes = (Set) getConfigItemByName(Configuration.getITEM_NAME_httpErrorCodes()); + final Set warnCodes = (Set) getConfigItemByName(Configuration.getITEM_NAME_httpWarningCodes()); + final Set successCodes = (Set) getConfigItemByName(Configuration.getITEM_NAME_httpSuccessCodes()); + + additionalErrorCodes.forEach(co -> { + successCodes.add(co); + errCodes.remove(co); + warnCodes.remove(co); + } + ); + + updateSuccessWarningErrorCodesConfiguration(errCodes, warnCodes, successCodes); + } + + public void updateSuccessWarningErrorCodesConfiguration(Object errCodes, Object warnCodes, Object successCodes) { + addConfigurationItem(Configuration.getITEM_NAME_httpErrorCodes(), errCodes); + addConfigurationItem(Configuration.getITEM_NAME_httpWarningCodes(), warnCodes); + addConfigurationItem(Configuration.getITEM_NAME_httpSuccessCodes(), successCodes); + } + + /** + * overwrites prefixOnlyHrefExtensions + */ + public void overwritePrefixOnlyHrefExtensions(Collection prefixesToBeConsidered) { + 
addConfigurationItem(Configuration.getITEM_NAME_prefixOnlyHrefExtensions(), prefixesToBeConsidered); + } + + /** + * checks plausibility of configuration: + * We need at least one html file as input, maybe several + * + * @param configuration instance + *

+ * srcDocs needs to be of type {@link FileCollection} + * to be Gradle-compliant + */ + public Boolean isValid() throws MisconfigurationException { + + // we need at least srcDir and srcDocs!! + File srcDir = (File) getConfigItemByName(Configuration.getITEM_NAME_sourceDir()); + Set srcDocs = (Set) getConfigItemByName(Configuration.getITEM_NAME_sourceDocuments()); + + // cannot check if source director is null (= unspecified) + if ((srcDir == null)) { + throw new MisconfigurationException("source directory must not be null"); + } + + + if ((!srcDir.exists())) { + throw new MisconfigurationException("given sourceDir " + srcDir + " does not exist."); + } + + + // cannot check if both input params are null + if (srcDocs == null) { + throw new MisconfigurationException("source documents must not be null"); + } + + + // empty SrcDocs + if (srcDocs.isEmpty()) { + throw new MisconfigurationException("source documents must not be empty"); + } + + + Object checksToExecute = getConfigItemByName(Configuration.getITEM_NAME_checksToExecute()); + if (!(checksToExecute == null || checksToExecute instanceof Collection)) { + throw new MisconfigurationException("checks to execute have to be a non empty collection"); + } + + + // if no exception has been thrown until now, + // the configuration seems to be valid.. 
+ return true; + } + + @Override + public String toString() { + return "Configuration{" + "configurationItems=" + configurationItems + "}"; + } + +} diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/MisconfigurationException.java b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/MisconfigurationException.java new file mode 100644 index 00000000..583b32b3 --- /dev/null +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/MisconfigurationException.java @@ -0,0 +1,35 @@ +package org.aim42.htmlsanitycheck; + +// see end-of-file for license information + + +import java.io.File; + +class MisconfigurationException extends Exception { + + + public MisconfigurationException(String message, File srcDir) { + super(message + ": " + srcDir.getAbsolutePath()); + } + + public MisconfigurationException(String message) { + super(message); + } +} + +/*======================================================================== + Copyright Gernot Starke and aim42 contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an + "AS IS" BASIS,WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ ========================================================================*/ \ No newline at end of file diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/ProductVersion.groovy b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/ProductVersion.java similarity index 82% rename from htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/ProductVersion.groovy rename to htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/ProductVersion.java index 65951abc..c5f39464 100644 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/ProductVersion.groovy +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/ProductVersion.java @@ -1,7 +1,11 @@ -package org.aim42.htmlsanitycheck +package org.aim42.htmlsanitycheck; -import org.slf4j.Logger -import org.slf4j.LoggerFactory +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URL; +import java.util.Properties; /** * provides the current product version, @@ -14,13 +18,13 @@ public class ProductVersion { private static final Logger logger = LoggerFactory.getLogger(ProductVersion.class); public static String getVersion() { - try{ + try { final URL RESOURCE = ProductVersion.class.getClassLoader().getResource("product-version.properties"); - Properties props = new Properties() - props.load(RESOURCE.openConnection().inputStream) + Properties props = new Properties(); + props.load(RESOURCE.openConnection().getInputStream()); return props.getProperty("version"); } catch (IOException E) { - logger.debug("ProductVersion cannot be obtained due to IOException.") + logger.debug("ProductVersion cannot be obtained due to IOException."); } return "[unknown]"; } diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/AllCheckers.groovy b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/AllCheckers.java similarity index 53% rename from 
htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/AllCheckers.groovy rename to htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/AllCheckers.java index 2e70e655..7850aacf 100644 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/check/AllCheckers.groovy +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/AllCheckers.java @@ -1,4 +1,7 @@ -package org.aim42.htmlsanitycheck.check +package org.aim42.htmlsanitycheck.check; + +import java.util.Arrays; +import java.util.LinkedHashSet; /************************************************************************ * This is free software - without ANY guarantee! @@ -19,17 +22,15 @@ * limitations under the License. * *********************************************************************** */ - -class AllCheckers { - - public final static LinkedHashSet checkerClazzes = - [BrokenCrossReferencesChecker, - BrokenHttpLinksChecker, - DuplicateIdChecker, - ImageMapChecker, - MissingAltInImageTagsChecker, - MissingImageFilesChecker, - MissingLocalResourcesChecker].toSet() - - -} \ No newline at end of file +public class AllCheckers { + public static final LinkedHashSet> checkerClazzes = + new LinkedHashSet>( + Arrays.asList( + BrokenCrossReferencesChecker.class, + BrokenHttpLinksChecker.class, + DuplicateIdChecker.class, + ImageMapChecker.class, + MissingAltInImageTagsChecker.class, + MissingImageFilesChecker.class, + MissingLocalResourcesChecker.class)); +} diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/BrokenCrossReferencesChecker.java b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/BrokenCrossReferencesChecker.java new file mode 100644 index 00000000..632edda6 --- /dev/null +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/BrokenCrossReferencesChecker.java @@ -0,0 +1,132 @@ +package org.aim42.htmlsanitycheck.check; + +import org.aim42.htmlsanitycheck.Configuration; +import 
org.aim42.htmlsanitycheck.collect.SingleCheckResults; +import org.aim42.htmlsanitycheck.html.HtmlPage; +import org.aim42.htmlsanitycheck.tools.Web; + +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +public class BrokenCrossReferencesChecker extends SuggestingChecker { + private List listOfIds; + private List hrefList; + private Set hrefSet; + + public BrokenCrossReferencesChecker(Configuration pConfig) { + super(pConfig); + } + + @Override + protected void initCheckingResultsDescription() { + getCheckingResults().setWhatIsChecked("Broken Internal Links Check"); + getCheckingResults().setSourceItemName("href"); + getCheckingResults().setTargetItemName("missing id"); + } + + @Override + protected void setValidPossibilities() { + setValidPossibilities(listOfIds); + } + + @Override + protected SingleCheckResults check(final HtmlPage pageToCheck) { + //get list of all a-tags "(hrefList); + + // get list of all id="XYZ" + listOfIds = pageToCheck.getAllIdStrings(); + + checkAllInternalLinks(); + + return getCheckingResults(); + } + + /** + * check all internal links against the existing id's + */ + private void checkAllInternalLinks() { + // for all hrefSet check if the corresponding id exists + + hrefSet.forEach(this::checkSingleInternalLink); + + } + + /** + * check a single internal link (href) against the existing id's within + * the html document + */ + private void checkSingleInternalLink(String href) { + getCheckingResults().incNrOfChecks(); + if (Web.containsInvalidChars(href)) { + // we found link with illegal characters! 
+ String findingText = "link \"" + href + "\" contains illegal characters"; + // now count occurrences - how often is it referenced + int nrOfReferences = countNrOfReferences(href); + if (nrOfReferences > 1) { + findingText += ", reference count: " + nrOfReferences; + } + + getCheckingResults().newFinding(findingText, nrOfReferences); + } else if (Web.isCrossReference(href)) { + + // bookkeeping: + getCheckingResults().incNrOfChecks(); + + doesLinkTargetExist(href); + } + + } + + /** + * check if the id for the href parameter exists + * + * @param href = "#XYZ" in id="XYZ" + */ + private void doesLinkTargetExist(String href) { + if (href.equals("#")) { + return; + } + + // strip href of its leading "#" + String linkTarget = (href.startsWith("#")) ? href.substring(1) : href; + // fragment can be URL-encoded + try { + linkTarget = URLDecoder.decode(linkTarget, "UTF-8"); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + + if (!listOfIds.contains(linkTarget)) { + // we found a broken link! + addBrokenLinkToResults(linkTarget, href); + } + + } + + /** + * bookkeeping the broken links that we found + */ + private void addBrokenLinkToResults(String linkTarget, String href) { + String findingText = "link target \"" + linkTarget + "\" missing"; + + // now count occurrences - how often is it referenced + int nrOfReferences = countNrOfReferences(href); + if (nrOfReferences > 1) { + findingText += ", reference count: " + nrOfReferences; + } + + + // determine suggestions "what could have been meant?" 
+ + getCheckingResults().newFinding(findingText, nrOfReferences); + } + + private int countNrOfReferences(String href) { + return (int) hrefList.stream().filter(href::equals).count(); + } +} diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/BrokenHttpLinksChecker.java b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/BrokenHttpLinksChecker.java new file mode 100644 index 00000000..acb0c5a8 --- /dev/null +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/BrokenHttpLinksChecker.java @@ -0,0 +1,246 @@ +package org.aim42.htmlsanitycheck.check; + +import org.aim42.htmlsanitycheck.Configuration; +import org.aim42.htmlsanitycheck.collect.Finding; +import org.aim42.htmlsanitycheck.collect.SingleCheckResults; +import org.aim42.htmlsanitycheck.html.HtmlPage; +import org.aim42.htmlsanitycheck.tools.Web; +import org.aim42.net.TrustAllCertificates; + +import java.io.IOException; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.UnknownHostException; +import java.util.Collection; +import java.util.Set; + + +/** + * Check html anchor href attributes + * @see https://www.w3schools.com/tags/att_a_href.asp + */ +class BrokenHttpLinksChecker extends Checker { + + // get the (configured) statusCodes, just syntactic sugar... 
+ private final Collection successCodes; + private final Collection warningCodes; + private final Collection errorCodes; + // all href attributes with http(s) protocol, + // including potential duplicates + // need that to calculate "nrOfOccurrences" + // the pure http/https-hrefs a set, duplicates are removed here + private Set hrefSet; + + + BrokenHttpLinksChecker(Configuration pConfig) { + super(pConfig); + + errorCodes = getMyConfig().getConfigItemByNameSetOfIntegers(Configuration.getITEM_NAME_httpErrorCodes()); + warningCodes = getMyConfig().getConfigItemByNameSetOfIntegers(Configuration.getITEM_NAME_httpWarningCodes()); + successCodes = getMyConfig().getConfigItemByNameSetOfIntegers(Configuration.getITEM_NAME_httpSuccessCodes()); + } + + @Override + protected void initCheckingResultsDescription() { + getCheckingResults().setWhatIsChecked("External Links Check"); + getCheckingResults().setSourceItemName("anchor href attribute"); + getCheckingResults().setTargetItemName("broken external link"); + + } + + @Override + protected SingleCheckResults check(final HtmlPage pageToCheck) { + + //get set of all a-tags " + * No constructor is defined, allowing for arbitrary "named parameters" + * in constructor calls. + *

+ * While checking, every subclass builds an instance of {@link SingleCheckResults} + * + * @author Gernot Starke + */ +public abstract class Checker { + private SingleCheckResults checkingResults; + private Configuration myConfig; + + public Checker(Configuration pConfig) { + this.myConfig = pConfig; + } + + /** + * * template method for performing a single type of checks on the given @see HtmlPage. + *

+ * Prerequisite: pageToCheck has been successfully parsed, + * prior to constructing this Checker instance. + **/ + public SingleCheckResults performCheck(final HtmlPage pageToCheck) { + // assert non-null htmlPage + assert pageToCheck != null; + + checkingResults = new SingleCheckResults(); + + // description is set by subclasses + initCheckingResultsDescription(); + + return check(pageToCheck);// <1> delegate check() to subclass + } + + /** + * Initialize with suitable description. + */ + protected abstract void initCheckingResultsDescription(); + + /** + * Perform a particular kind of checks, i.e. missing-local-images-check + *

+ * Called by {@link #performCheck()} as part of the template method pattern. + * + * @return collected results of this Checker instance + */ + protected abstract SingleCheckResults check(final HtmlPage pageToCheck); + + public SingleCheckResults getCheckingResults() { + return checkingResults; + } + + public void setCheckingResults(SingleCheckResults checkingResults) { + this.checkingResults = checkingResults; + } + + public Configuration getMyConfig() { + return myConfig; + } + + public void setMyConfig(Configuration myConfig) { + this.myConfig = myConfig; + } +} diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/CheckerCreator.java b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/CheckerCreator.java new file mode 100644 index 00000000..2fa8b9d6 --- /dev/null +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/CheckerCreator.java @@ -0,0 +1,62 @@ +package org.aim42.htmlsanitycheck.check; + +import org.aim42.htmlsanitycheck.Configuration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Collection; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * abstract factory to create Checker instances + */ +public class CheckerCreator { + private static final Logger logger = LoggerFactory.getLogger(CheckerCreator.class); + + public static Set createCheckerClassesFrom(final Collection> checkerClasses, final Configuration pConfig) { + + return checkerClasses.stream() + .map(checkerClass -> CheckerCreator.createSingleChecker(checkerClass, pConfig)) + .collect(Collectors.toSet()); + } + + private static boolean isCase(Class caseValue, Class switchValue) { + if (switchValue != null) { + return caseValue.isAssignableFrom(switchValue); + } + return false; + } + + public static Checker createSingleChecker(final Class checkerClass, final Configuration pConfig) { + Checker checker; + + // switch over all possible Checker classes + // in case of new Checkers, this 
has to be adapted, + // as Checker constructors will differ in minor details! + + // clearly violates the open-close principle + + if (isCase(BrokenCrossReferencesChecker.class, checkerClass)) { + checker = new BrokenCrossReferencesChecker(pConfig); + } else if (isCase(BrokenHttpLinksChecker.class, checkerClass)) { + checker = new BrokenHttpLinksChecker(pConfig); + } else if (isCase(DuplicateIdChecker.class, checkerClass)) { + checker = new DuplicateIdChecker(pConfig); + } else if (isCase(ImageMapChecker.class, checkerClass)) { + checker = new ImageMapChecker(pConfig); + } else if (isCase(MissingAltInImageTagsChecker.class, checkerClass)) { + checker = new MissingAltInImageTagsChecker(pConfig); + } else if (isCase(MissingImageFilesChecker.class, checkerClass)) { + checker = new MissingImageFilesChecker(pConfig); + } else if (isCase(MissingLocalResourcesChecker.class, checkerClass)) { + checker = new MissingLocalResourcesChecker(pConfig); + } else { + logger.warn("unknown Checker " + checkerClass.toString()); + throw new UnknownCheckerException(checkerClass.toString()); + } + + return checker; + + } +} diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/DuplicateIdChecker.java b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/DuplicateIdChecker.java new file mode 100644 index 00000000..d1b902c6 --- /dev/null +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/DuplicateIdChecker.java @@ -0,0 +1,84 @@ +package org.aim42.htmlsanitycheck.check; + +import org.aim42.htmlsanitycheck.Configuration; +import org.aim42.htmlsanitycheck.collect.SingleCheckResults; +import org.aim42.htmlsanitycheck.html.HtmlElement; +import org.aim42.htmlsanitycheck.html.HtmlPage; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +public class DuplicateIdChecker extends Checker { + private Set idStringsSet; + private List idStringsList; + + public 
DuplicateIdChecker(Configuration pConfig) { + super(pConfig); + } + + /** + * find all tags with specific id value + * + * @param id + * @param allTags List of tags containing id-attribute + */ + public static List getAllTagsWithSpecificId(final String idString, List allTags) { + return allTags.stream().filter(htmlElement -> htmlElement.getIdAttribute().equals(idString)).collect(Collectors.toList()); + + } + + @Override + protected void initCheckingResultsDescription() { + getCheckingResults().setWhatIsChecked("Duplicate Definition of id Check"); + getCheckingResults().setSourceItemName("id"); + getCheckingResults().setTargetItemName("duplicate id"); + } + + @Override + protected SingleCheckResults check(final HtmlPage pageToCheck) { + + //get list of all tagsWithId '<... id="XYZ"...' in html file + + idStringsList = pageToCheck.getAllIdStrings(); + idStringsSet = new HashSet<>(idStringsList); + + checkForDuplicateIds(idStringsSet); + + return getCheckingResults(); + + } + + private void checkForDuplicateIds(Set idStringsSet) { + idStringsSet.forEach(oneIdString -> checkForDuplicateDefinition(oneIdString)); + } + + private void checkForDuplicateDefinition(final String idString) { + getCheckingResults().incNrOfChecks(); + + int nrOfOccurrences = (int) idStringsList.stream().filter(it -> it.equals(idString)).count(); + + // duplicate, IFF idString appears more than once in idStringsList + if (nrOfOccurrences > 1) { + getCheckingResults().newFinding("id \"" + idString + "\" has " + nrOfOccurrences + " definitions."); + } + + } + + public Set getIdStringsSet() { + return idStringsSet; + } + + public void setIdStringsSet(Set idStringsSet) { + this.idStringsSet = idStringsSet; + } + + public List getIdStringsList() { + return idStringsList; + } + + public void setIdStringsList(List idStringsList) { + this.idStringsList = idStringsList; + } +} diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/ImageMapChecker.java 
b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/ImageMapChecker.java new file mode 100644 index 00000000..3de60b03 --- /dev/null +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/ImageMapChecker.java @@ -0,0 +1,186 @@ +package org.aim42.htmlsanitycheck.check; + +import org.aim42.htmlsanitycheck.Configuration; +import org.aim42.htmlsanitycheck.collect.Finding; +import org.aim42.htmlsanitycheck.collect.SingleCheckResults; +import org.aim42.htmlsanitycheck.html.HtmlElement; +import org.aim42.htmlsanitycheck.html.HtmlPage; +import org.aim42.htmlsanitycheck.tools.Web; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * principal checks on imageMap usage: + *

+ * 1.) for every usemap-reference there is one map + * 2.) every map is referenced by at least one image + * 3.) every map name is unique + * 4.) every area-tag has one non-empty href attribute + * 5.) every href points to valid target (broken-links check) + *

+ * see also: http://www.w3schools.com/tags/tag_map.asp + **/ +public class ImageMapChecker extends Checker { + private List maps; + private List mapNames; + private List imagesWithUsemapRefs; + private List usemapRefs; + private List listOfIds; + private String findingText; + private HtmlPage pageToCheck; + + + public ImageMapChecker(Configuration pConfig) { + super(pConfig); + } + + @Override + protected void initCheckingResultsDescription() { + getCheckingResults().setWhatIsChecked("Consistency of ImageMaps"); + getCheckingResults().setSourceItemName("imageMap"); + getCheckingResults().setTargetItemName("map/area and usemap-references"); + } + + @Override + protected SingleCheckResults check(final HtmlPage pageToCheck) { + + this.pageToCheck = pageToCheck; + + readImageMapAttributesFromHtml(); + + checkBrokenImageMapReferences(); + + checkDuplicateMapNames(); + + checkDanglingMaps(); + + checkEmptyMaps(); + + checkForBrokenHrefLinks();// the major check + + return getCheckingResults(); + } + + private void checkDanglingMaps() { + mapNames.stream() + .filter(n -> !usemapRefs.contains(n)) + .map(mapName -> "ImageMap \"" + mapName + "\" not referenced by any image.") + .forEach(findingText -> getCheckingResults().addFinding(new Finding(findingText))); + } + + private void checkEmptyMaps() { + mapNames.stream() // report every map that contains no area at all + .filter(mapName -> pageToCheck.getAllAreasForMapName(mapName).isEmpty()) + .peek(mapName -> getCheckingResults().incNrOfChecks()) + .map(mapName -> "ImageMap \"" + mapName + "\" has no area tags.") // build the text locally instead of reusing the findingText field + .forEach(text -> getCheckingResults().addFinding(new Finding(text))); + } + + /* +check for duplicate map names + */ + private void checkDuplicateMapNames() { + int mapNameCount; + + Set mapNameSet = new HashSet<>(mapNames); + + mapNameSet.stream() + .peek(a -> getCheckingResults().incNrOfChecks()) + .filter(name -> mapNames.stream().filter(name2 -> name2.equals(name)).count() > 1) + .forEach(mapName -> + getCheckingResults().addFinding( + new Finding(mapNames.stream().filter(name2 ->
name2.equals(mapName)).count() + " imagemaps with identical name \"" + mapName + "\" exist."))); + + + } + + /* + * ... + * a.) if there is no map named "y" -> problem + * b.) if there are more maps named "y" -> problem + */ + private void checkBrokenImageMapReferences() { + imagesWithUsemapRefs.stream() + .forEach(imageTag -> checkBrokenImageMapReference(imageTag.getUsemapRef(), imageTag)); + } + + private void checkBrokenImageMapReference(String imgMap, HtmlElement imageTag) { + getCheckingResults().incNrOfChecks(); + + + long mapCount = mapNames.stream().filter(it -> java.util.Objects.equals(it, imgMap)).count(); // compare by value; == only tests object identity + + if (mapCount == 0L) { + // no map found, despite img-tag usemap-reference + findingText = "ImageMap \"" + imageTag.getUsemapRef() + "\" (referenced by image \"" + imageTag.getImageSrcAttribute() + "\") missing."; + getCheckingResults().addFinding(new Finding(findingText)); + } + } + + private void checkForBrokenHrefLinks() { + + mapNames.forEach(n -> checkAreaHrefsForMapName(n)); + } + + /* + for a specific mapName, check all its contained areaHrefs + */ + private void checkAreaHrefsForMapName(String mapName) { + List areaHrefs = pageToCheck.getAllHrefsForMapName(mapName); + + // if this List is empty -> the map is empty + // TODO replace checkEmptyMaps with additional check here + + areaHrefs.stream() + .peek(a -> getCheckingResults().incNrOfChecks()) + .filter(href -> Web.isCrossReference(href)) + .forEach(href -> checkLocalHref(href, mapName, areaHrefs)); + + } + + /* +check if href has valid local target +TODO: currently restricted to LOCAL references +TODO: remove duplication to BrokenCrossReferencesChecker +*/ + private void checkLocalHref(String href, String mapName, List areaHrefs) { + // strip href of its leading "#" + String linkTarget = (href.startsWith("#")) ? href.substring(1) : href; + + + if (!listOfIds.contains(linkTarget)) { + + // we found a broken link!
+ findingText = "ImageMap \"" + mapName + "\" refers to missing link \"" + linkTarget + "\""; + + // now count occurrences - how often is it referenced (by value; == would only test object identity) + int nrOfReferences = (int) areaHrefs.stream().filter(it -> java.util.Objects.equals(it, href)).count(); + if (nrOfReferences > 1) { + findingText += ", reference count: " + nrOfReferences + "."; + } else findingText += "."; + + getCheckingResults().newFinding(findingText, nrOfReferences); + } + + } + + private void readImageMapAttributesFromHtml() { + // get all + imagesWithUsemapRefs = pageToCheck.getImagesWithUsemapDeclaration(); + + // get all ... + maps = pageToCheck.getAllImageMaps(); + + // get the names of all maps + mapNames = pageToCheck.getAllMapNames(); + + // get all referenced maps from image tags with usemap-attribute + usemapRefs = pageToCheck.getAllUsemapRefs(); + + // list of all id="XYZ" + listOfIds = pageToCheck.getAllIdStrings(); + + } +} diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/MissingAltInImageTagsChecker.java b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/MissingAltInImageTagsChecker.java new file mode 100644 index 00000000..26b8dd81 --- /dev/null +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/MissingAltInImageTagsChecker.java @@ -0,0 +1,48 @@ +package org.aim42.htmlsanitycheck.check; + +import org.aim42.htmlsanitycheck.Configuration; +import org.aim42.htmlsanitycheck.collect.SingleCheckResults; +import org.aim42.htmlsanitycheck.html.HtmlElement; +import org.aim42.htmlsanitycheck.html.HtmlPage; + +/** + * checks for missing or empty alt-attributes in image tags.
+ */ +public class MissingAltInImageTagsChecker extends Checker { + public MissingAltInImageTagsChecker(Configuration pConfig) { + super(pConfig); + } + + @Override + protected void initCheckingResultsDescription() { + getCheckingResults().setWhatIsChecked("Missing alt-attribute declaration in image tags"); + getCheckingResults().setSourceItemName("image tags"); + getCheckingResults().setTargetItemName("missing alt attributes"); + } + + + @Override + protected SingleCheckResults check(final HtmlPage pageToCheck) { + // the number of checks is calculated by counting + // ALL image tags: + getCheckingResults().setNrOfChecks(pageToCheck.getAllImageTags().size()); + + // see HtmlPageSpec for behavior: missing or empty alt-attributes are included... + pageToCheck.getAllImageTagsWithMissingAltAttribute().stream() + .forEach(element -> reportSingleImageTagWithMissingAlt(element)); + + + return getCheckingResults(); + } + + private void reportSingleImageTagWithMissingAlt(HtmlElement element) { + + String imageName = element.getImageSrcAttribute(); + + String findingText = "image \"" + imageName + "\" is missing alt-attribute"; + + getCheckingResults().newFinding(findingText); + + } + +} diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/MissingImageFilesChecker.java b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/MissingImageFilesChecker.java new file mode 100644 index 00000000..a9b711f1 --- /dev/null +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/MissingImageFilesChecker.java @@ -0,0 +1,128 @@ +package org.aim42.htmlsanitycheck.check; + +import org.aim42.htmlsanitycheck.Configuration; +import org.aim42.htmlsanitycheck.collect.SingleCheckResults; +import org.aim42.htmlsanitycheck.html.HtmlElement; +import org.aim42.htmlsanitycheck.html.HtmlPage; +import org.aim42.htmlsanitycheck.tools.Web; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import 
java.io.UnsupportedEncodingException; +import java.net.URLDecoder; +import java.util.List; +import java.util.regex.Pattern; + +public class MissingImageFilesChecker extends Checker { + + private static final Logger logger = LoggerFactory.getLogger(MissingImageFilesChecker.class); + private List images; + private final File baseDir; + private File currentDir; + + public MissingImageFilesChecker(Configuration pConfig) { + super(pConfig); + baseDir = ((File) (getMyConfig().getConfigItemByName(Configuration.getITEM_NAME_sourceDir()))); + } + + @Override + protected void initCheckingResultsDescription() { + getCheckingResults().setWhatIsChecked("Missing Local Images Check"); + getCheckingResults().setSourceItemName("img src attributes"); + getCheckingResults().setTargetItemName("missing image files"); + } + + @Override + protected SingleCheckResults check(final HtmlPage pageToCheck) { + final File file1 = pageToCheck.getFile(); + final File file = (file1 == null ? null : file1.getParentFile()); + currentDir = file != null ? file : baseDir; + + //get list of all image-tags " checkSingleLocalImage(image)); + + } + + private void checkSingleLocalImage(HtmlElement image) { + String imageSrcAttribute = image.getImageSrcAttribute(); + + // check only "local" image references + // (that is, NO remote URL) + boolean isRemoteURL = Web.isRemoteURL(imageSrcAttribute); + boolean isDataURI = Web.isDataURI(imageSrcAttribute); + if (isRemoteURL) { + //do nothing. 
This checks for _local_ images + } else if (isDataURI) { + // bookkeeping: + getCheckingResults().incNrOfChecks(); + + doesDataURIContainData(imageSrcAttribute); + + } else { + //we have a simple local image + + // bookkeeping: + getCheckingResults().incNrOfChecks(); + + doesImageFileExist(imageSrcAttribute); + } + + } + + /** + * check if a single image file exists + * + * @param relativePathToImageFile == XYZ in + **/ + private void doesImageFileExist(String relativePathToImageFile) { + File parentDir = relativePathToImageFile.startsWith("/") ? baseDir : currentDir; + + String decodedRelativePathtoImageFile = null; + try { + decodedRelativePathtoImageFile = URLDecoder.decode(relativePathToImageFile,"UTF-8"); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + + File imageFile = new File(parentDir, decodedRelativePathtoImageFile); + + if (!imageFile.exists() || imageFile.isDirectory()) { + String findingText = "image \"" + relativePathToImageFile + "\" missing"; + getCheckingResults().newFinding(findingText); + } + + + } + + /** + * check if the given data-URI contains actual data + *

+ * Good: "..." + *

+ * Bad: "data:image/jpg;base64," + * + * @param dataURI == XYZ in + **/ + private void doesDataURIContainData(String dataURI) { + // let's do a simple regexp + + if (Pattern.matches("^data:image/[a-z]+;base64,", dataURI)) { + String findingText = "data-URI image missing"; + getCheckingResults().newFinding(findingText); + } + + } +} diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/MissingLocalResourcesChecker.java b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/MissingLocalResourcesChecker.java new file mode 100644 index 00000000..75ef6f08 --- /dev/null +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/MissingLocalResourcesChecker.java @@ -0,0 +1,123 @@ +package org.aim42.htmlsanitycheck.check; + +import org.aim42.htmlsanitycheck.Configuration; +import org.aim42.htmlsanitycheck.collect.SingleCheckResults; +import org.aim42.htmlsanitycheck.html.HtmlPage; +import org.aim42.htmlsanitycheck.tools.Web; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +public class MissingLocalResourcesChecker extends Checker { + public static final String MLRC_MESSAGE_PREFIX = "local resource"; + public static final String MLRC_MESSAGE_MISSING = "missing"; + public static final String MLRC_REFCOUNT = ", reference count: "; + private static final Logger logger = LoggerFactory.getLogger(MissingLocalResourcesChecker.class); + private Set localResourcesSet; + /** + * The base directory to resolve absolute paths. + */ + private final File baseDir; + /** + * The current directory, obtained from the HtmlPage, to resolve + * relative paths. + */ + private File currentDir; + /** + * True to require files to be referenced and not directories. Useful if the web server doesn't + * support a default directory, such as Amazon S3. 
+ */ + private final boolean requireFiles = false; + public MissingLocalResourcesChecker(Configuration pConfig) { + super(pConfig); + baseDir = ((File) (pConfig.getConfigItemByName(Configuration.getITEM_NAME_sourceDir()))); + } + + @Override + protected void initCheckingResultsDescription() { + getCheckingResults().setWhatIsChecked("Missing Local Resources Check"); + getCheckingResults().setSourceItemName("anchor tag href attribute"); + getCheckingResults().setTargetItemName("missing local resources"); + } + + @Override + protected SingleCheckResults check(final HtmlPage pageToCheck) { + //get list of all anchor-tags containing href="xyz" in html file + List allHrefs = pageToCheck.getAllHrefStrings(); + + // now filter out all local resources + + // now filter out all local resources + localResourcesSet = allHrefs.stream().filter(Web::isLocalResource).collect(Collectors.toSet()); + + logger.debug("local resources set: " + localResourcesSet); + + final File file1 = pageToCheck.getFile(); + final File file = (file1 == null ? null : file1.getParentFile()); + currentDir = file != null ? 
file : baseDir; + + // perform the actual checks + checkAllLocalResources(localResourcesSet); + + return getCheckingResults(); + + } + + private void checkAllLocalResources(Set localResources) { + + localResources.forEach(localResource -> checkSingleLocalResource(localResource)); + } + + private void checkSingleLocalResource(String localResource) { + // the localResource is either path+filename or filename or directory + + logger.debug("single resource to be checked: + " + localResource); + + // bookkeeping: + getCheckingResults().incNrOfChecks(); + + // we need to strip the localResource of #anchor-parts + String localResourcePath = null; + try { + localResourcePath = new URI(localResource).getPath(); + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } + + if (localResourcePath == null) { + // For example, javascript:; + return; + + } + + + File parentDir = localResourcePath.startsWith("/") ? baseDir : currentDir; + + // we need the baseDir for robust checking of local resources... + File localFile = new File(parentDir, localResourcePath); + + // action required if resource does not exist + if (!localFile.exists() || !localFile.isFile()) { + handleNonexistingLocalResource(localResource); + } + + } + + private void handleNonexistingLocalResource(final String nonExistingLocalResource) { + String findingText = MLRC_MESSAGE_PREFIX + " \"" + nonExistingLocalResource + "\" " + MLRC_MESSAGE_MISSING; + + // how often is localResource referenced? 
+ int nrOfOccurrences = (int) localResourcesSet.stream().filter(et -> et.equals(nonExistingLocalResource)).count(); + + if (nrOfOccurrences > 1) findingText += MLRC_REFCOUNT + nrOfOccurrences; + + // add Finding to our current checking results, increment nrOfFindings by nrOfOccurrences + getCheckingResults().newFinding(findingText, nrOfOccurrences); + } +} diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/SuggestingChecker.java b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/SuggestingChecker.java new file mode 100644 index 00000000..6f923883 --- /dev/null +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/SuggestingChecker.java @@ -0,0 +1,79 @@ +package org.aim42.htmlsanitycheck.check; + +import org.aim42.htmlsanitycheck.Configuration; +import org.aim42.htmlsanitycheck.collect.Finding; +import org.aim42.htmlsanitycheck.collect.SingleCheckResults; +import org.aim42.htmlsanitycheck.html.HtmlPage; +import org.aim42.htmlsanitycheck.suggest.Suggester; + +import java.util.List; + +/** + * Abstract class for those @see Checker subclasses that + * can propose suggestions, not only identify errors. 
+ * Example: MissingImagesChecker might suggest names of existing images + * that "could have been meant" + */ +public abstract class SuggestingChecker extends Checker { + private List validPossibilities; + + public SuggestingChecker(Configuration pConfig) { + super(pConfig); + } + + @Override + protected abstract void initCheckingResultsDescription(); + + /** + * let the instance determine the list of possible values + * Examples: + * - MissingImageFilesChecker -> collect the names of image files + * - BrokenCrossReferencesChecker -> collect all (internal) link targets + **/ + protected abstract void setValidPossibilities(); + + @Override + protected abstract SingleCheckResults check(final HtmlPage pageToCheck); + + /** + * a little tricky: call performCheck on the superclass and add a little behavior :-) + * it's a Template-Method again. + * + * @return List of Findings (SingleCheckResults), but with suggestions for each finding + */ + @Override + public final SingleCheckResults performCheck(HtmlPage pageToCheck) { + SingleCheckResults scResults = super.performCheck(pageToCheck); + + setValidPossibilities(); + + determinSuggestionsForEveryFinding(); + + return scResults; + } + + /** + * determines suggestions for every Finding against the list + * of valid possibilities + */ + public void determinSuggestionsForEveryFinding() { + getCheckingResults().getFindings().stream().forEach(finding -> + determineSuggestionsForSingleFinding(finding)); + } + + /** + * asks the Suggester for suggestions for a single Finding (last argument 1, presumably the number of suggestions wanted) and stores them on that Finding + */ + public void determineSuggestionsForSingleFinding(Finding finding) { + finding.setSuggestions(Suggester.determineNSuggestions(finding.getWhatIsTheProblem(), validPossibilities, 1)); + + } + + public List getValidPossibilities() { + return validPossibilities; + } + + public void setValidPossibilities(List validPossibilities) { + this.validPossibilities = validPossibilities; + } +} diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/SuggestingChecker.java b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/SuggestingChecker.java new file mode 100644 index 00000000..6f923883 --- /dev/null +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/UnknownCheckerException.java 
b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/UnknownCheckerException.java new file mode 100644 index 00000000..7fbc2125 --- /dev/null +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/check/UnknownCheckerException.java @@ -0,0 +1,11 @@ +package org.aim42.htmlsanitycheck.check; + +public class UnknownCheckerException extends RuntimeException { + public UnknownCheckerException(String message) { + super(message); + } + + public UnknownCheckerException(String message, String checkerName) { + super(message + ": " + checkerName); + } +} diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/collect/CheckResults.groovy b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/collect/CheckResults.java similarity index 89% rename from htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/collect/CheckResults.groovy rename to htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/collect/CheckResults.java index 3cbf9f44..6c51d6b6 100644 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/collect/CheckResults.groovy +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/collect/CheckResults.java @@ -1,4 +1,6 @@ -package org.aim42.htmlsanitycheck.collect +package org.aim42.htmlsanitycheck.collect; + +import java.util.List; /************************************************************************ * This is free software - without ANY guarantee! @@ -25,9 +27,9 @@ public interface CheckResults { // return a description of what is checked // (e.g. 
"Missing Images Checker" or "Broken Cross-References Checker" - public String description() + String description(); // returns all findings/problems found during this check - public ArrayList getFindings() + List getFindings(); } // end::CheckResultsInterface[] diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/collect/Finding.groovy b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/collect/Finding.java similarity index 57% rename from htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/collect/Finding.groovy rename to htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/collect/Finding.java index d3219c5b..758da77c 100644 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/collect/Finding.groovy +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/collect/Finding.java @@ -1,6 +1,10 @@ // see end-of-file for license information -package org.aim42.htmlsanitycheck.collect +package org.aim42.htmlsanitycheck.collect; + +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; /** * A single "finding" from any check, i.e.: @@ -8,78 +12,90 @@ * - a missing label/id/bookmark (== broken link) * - a duplicate label/id/bookmark */ -class Finding { - - String whatIsTheProblem // i.e. which image is missing, which link/anchor is undefined - - int nrOfOccurrences // how often does this specific finding occur in the checked-page +public class Finding { + String whatIsTheProblem; // i.e. which image is missing, which link/anchor is undefined + int nrOfOccurrences;// how often does this specific finding occur in the checked-page // suggestions are ordered: getAt(0) yields the best, getAt(1) the second and so forth - ArrayList suggestions - + List suggestions; public Finding() { - this( "" ) + this(""); } /** * no finding should exist without giving an explanation ("whatIsTheProblem") * about what went wrong. 
+ * * @param whatIsTheProblem An explanation of what went wrong (i.e. name of missing file) */ public Finding(String whatIsTheProblem) { - this( whatIsTheProblem, 1, new ArrayList(3)) + this(whatIsTheProblem, 1, new ArrayList(3)); } - /** * finding with explanation and several occurences */ - public Finding(String whatIsTheProblem, int nrOfOccurrences ) { - this( whatIsTheProblem, nrOfOccurrences, new ArrayList(3)) + public Finding(String whatIsTheProblem, int nrOfOccurrences) { + this(whatIsTheProblem, nrOfOccurrences, new ArrayList(3)); } /** * most general constructor: * create Finding with explanation and nrOfOccurrences - * @param whatIsTheProblem An explanation of what went wrong (i.e. name of missing file) - * */ + * + * @param whatIsTheProblem An explanation of what went wrong (i.e. name of missing file) + */ public Finding(String whatIsTheProblem, int nrOfOccurrences, ArrayList suggestions) { - this.whatIsTheProblem = whatIsTheProblem - this.nrOfOccurrences = nrOfOccurrences - this.suggestions = suggestions + this.whatIsTheProblem = whatIsTheProblem; + this.nrOfOccurrences = nrOfOccurrences; + this.suggestions = suggestions; } /** * create Finding with explanation and suggestions + * * @param whatIsTheProblem explanation what went wrong - * @param suggestions what could have been meant + * @param suggestions what could have been meant */ public Finding(String whatIsTheProblem, ArrayList suggestions) { - this( whatIsTheProblem, 1, suggestions) + this(whatIsTheProblem, 1, suggestions); + } + + public String getWhatIsTheProblem() { + return whatIsTheProblem; + } + + public int getNrOfOccurrences() { + return nrOfOccurrences; + } + + public void setSuggestions(List suggestions) { + this.suggestions = suggestions; } /** * add a single suggestion to the list of suggestions + * * @param suggestion */ public void addSingleSuggestion(String suggestion) { - suggestions.add(suggestion) + suggestions.add(suggestion); } - public void setNrOfOccurences( int 
nrOfOccurences ) { - this.nrOfOccurrences = nrOfOccurrences + public void setNrOfOccurences(int nrOfOccurences) { + this.nrOfOccurrences = nrOfOccurences; /* assign the parameter (spelled with a single 'r'); the former right-hand side named the field itself, making the setter a no-op */ } @Override public String toString() { - String refCount = (nrOfOccurrences > 1) ? " (reference count: $nrOfOccurrences)": "" - String suggestionStr = (suggestions.size() > 0) ? " (Suggestions: " + suggestions.join(", ") + ")": "" + String refCount = (nrOfOccurrences > 1) ? " (reference count: " + nrOfOccurrences + ")" : ""; /* Java strings do not interpolate Groovy-style $name placeholders, so the count is concatenated */ + String suggestionStr = (suggestions.size() > 0) ? "\n (Suggestions: " + suggestions.stream().collect(Collectors.joining(", ")) + ")" : ""; /* separator ", " as in the replaced Groovy join(", ") */ - return whatIsTheProblem + refCount + (suggestionStr ? "\n" + suggestionStr : "") + return whatIsTheProblem + refCount + (suggestionStr.isEmpty() ? "" : suggestionStr); } diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/collect/PageResults.groovy b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/collect/PageResults.java similarity index 81% rename from htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/collect/PageResults.groovy rename to htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/collect/PageResults.java index c843b1c5..f8686367 100644 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/collect/PageResults.groovy +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/collect/PageResults.java @@ -1,4 +1,4 @@ -package org.aim42.htmlsanitycheck.collect +package org.aim42.htmlsanitycheck.collect; /************************************************************************ * This is free software - without ANY guarantee! @@ -24,19 +24,20 @@ public interface PageResults { // what's the title of this page? - public String getPageTitle() + String getPageTitle(); // what's the filename and path? - public String getPageFileName() - public String getPageFilePath() + String getPageFileName(); + + String getPageFilePath(); // how many items have been checked? 
- public int nrOfItemsCheckedOnPage() + int nrOfItemsCheckedOnPage(); // how many problems were found on this page? - public int nrOfFindingsOnPage() + int nrOfFindingsOnPage(); // how many different checks have run on this page? - public int howManyCheckersHaveRun() + int howManyCheckersHaveRun(); } // end::PageResultInterface[] diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/collect/PerRunResults.groovy b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/collect/PerRunResults.java similarity index 67% rename from htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/collect/PerRunResults.groovy rename to htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/collect/PerRunResults.java index 1db7eee1..313f227b 100644 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/collect/PerRunResults.groovy +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/collect/PerRunResults.java @@ -1,25 +1,23 @@ -package org.aim42.htmlsanitycheck.collect +package org.aim42.htmlsanitycheck.collect; + +import java.util.ArrayList; /** * Collects checking results of 1..n html files which are checked together in one "run". - * + *

* Can keep results spanning more than one file (e.g. unused-image-files). - * */ -class PerRunResults implements RunResults { - - private ArrayList resultsForAllPages +public class PerRunResults implements RunResults { + public final static Long ILLEGAL_TIMER = -7353315L; + // magic number - also used in tests + private final static Long TIMER_STILL_RUNNING = 42L; // unused images is the only check concerning all pages... - SingleCheckResults unusedImagesResultsCollector - + SingleCheckResults unusedImagesResultsCollector; + private final ArrayList resultsForAllPages; // checking time is important - therefore we maintain a timer - private Long startedCheckingTimeMillis - private Long finishedCheckingTimeMillis - - // magic number - also used in tests - private final static Long TIMER_STILL_RUNNING = 42 - public final static Long ILLEGAL_TIMER = -7353315 + private final Long startedCheckingTimeMillis; + private Long finishedCheckingTimeMillis; /** @@ -29,10 +27,10 @@ class PerRunResults implements RunResults { * + a simple timer to validate the checks ran fast enough */ public PerRunResults() { - this.startedCheckingTimeMillis = System.currentTimeMillis() - this.finishedCheckingTimeMillis = TIMER_STILL_RUNNING + this.startedCheckingTimeMillis = System.currentTimeMillis(); + this.finishedCheckingTimeMillis = TIMER_STILL_RUNNING; - this.resultsForAllPages = new ArrayList() + this.resultsForAllPages = new ArrayList(); } @@ -41,50 +39,49 @@ public PerRunResults() { */ @Override public ArrayList getResultsForAllPages() { - return resultsForAllPages + return resultsForAllPages; } /** * stop the checking timer */ public void stopTimer() { - finishedCheckingTimeMillis = System.currentTimeMillis() + finishedCheckingTimeMillis = System.currentTimeMillis(); } /** * query the timer - * if timer has not yet been stopped - return a crazy number + * if timer has not yet been stopped - return a crazy number */ @Override public Long checkingTookHowManyMillis() { - Long itTookSoLong 
+ Long itTookSoLong; if (finishedCheckingTimeMillis == TIMER_STILL_RUNNING) - itTookSoLong = ILLEGAL_TIMER // if read upside down: "Sie Esel" + itTookSoLong = ILLEGAL_TIMER; // if read upside down: "Sie Esel" else - itTookSoLong = finishedCheckingTimeMillis - startedCheckingTimeMillis + itTookSoLong = finishedCheckingTimeMillis - startedCheckingTimeMillis; - return itTookSoLong + return itTookSoLong; } - /** * adds one kind of checking results. + * * @param pageResults : checking results for a single HTML page */ public void addPageResults(SinglePageResults pageResults) { - assert resultsForAllPages != null + assert resultsForAllPages != null; - resultsForAllPages.add(pageResults) + resultsForAllPages.add(pageResults); } /** - * * @return how many distinct CheckingResultCollectors have been added (so far)? */ @Override public int nrOfPagesChecked() { - return resultsForAllPages.size() + return resultsForAllPages.size(); } /** @@ -92,11 +89,12 @@ public int nrOfPagesChecked() { */ @Override public int nrOfChecksPerformedOnAllPages() { - int nrOfChecks = 0 - resultsForAllPages.each { singlePageResults -> - nrOfChecks += singlePageResults.nrOfItemsCheckedOnPage() - } - return nrOfChecks + + return resultsForAllPages.stream() + .map(singlePageResults -> singlePageResults.nrOfItemsCheckedOnPage()) + .reduce((a, b) -> a + b).orElseGet(() -> 0); + + } /** @@ -104,11 +102,10 @@ public int nrOfChecksPerformedOnAllPages() { */ @Override public int nrOfFindingsOnAllPages() { - int totalNrOfFindings = 0 - resultsForAllPages.each { pageResult -> - totalNrOfFindings += pageResult.nrOfFindingsOnPage() - } - return totalNrOfFindings + + return resultsForAllPages.stream() + .map(singlePageResults -> singlePageResults.nrOfFindingsOnPage()) + .reduce((a, b) -> a + b).orElseGet(() -> 0); } diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/collect/RunResults.groovy b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/collect/RunResults.java 
similarity index 81% rename from htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/collect/RunResults.groovy rename to htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/collect/RunResults.java index 0bc26d8d..540394fa 100644 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/collect/RunResults.groovy +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/collect/RunResults.java @@ -1,4 +1,6 @@ -package org.aim42.htmlsanitycheck.collect +package org.aim42.htmlsanitycheck.collect; + +import java.util.List; /************************************************************************ * This is free software - without ANY guarantee! @@ -24,18 +26,18 @@ public interface RunResults { // returns results for all pages which have been checked - public ArrayList getResultsForAllPages() + List getResultsForAllPages(); // how many pages were checked in this run? - public int nrOfPagesChecked() + int nrOfPagesChecked(); // how many checks were performed in all? - public int nrOfChecksPerformedOnAllPages() + int nrOfChecksPerformedOnAllPages(); // how many findings (errors and issues) were found in all? - public int nrOfFindingsOnAllPages() + int nrOfFindingsOnAllPages(); // how long took checking (in milliseconds)? 
- public Long checkingTookHowManyMillis() + Long checkingTookHowManyMillis(); } // end::RunResultInterface[] \ No newline at end of file diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/collect/SingleCheckResults.java b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/collect/SingleCheckResults.java new file mode 100644 index 00000000..8cb4bd49 --- /dev/null +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/collect/SingleCheckResults.java @@ -0,0 +1,207 @@ +// see end-of-file for license information + +package org.aim42.htmlsanitycheck.collect; + +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + +/** + * collects results for a specific type of @see Checker + * (i.e. missing images, broken cross-references). + * + * @author Gernot Starke + */ + +public class SingleCheckResults implements CheckResults { + + // the actual findings + public ArrayList findings; + String whatIsChecked; // i.e. "Missing Local Images Check" + // source-whatIsTheProblem is checked against target-whatIsTheProblem + String sourceItemName; // i.e. image-src-attribute, anchor/link + String targetItemName; // i.e. local-image-file, id/bookmark + String generalRemark; // i.e. "Internet not available" + int nrOfItemsChecked; + private int nrOfIssues; + + /** + * Initialize some members. + *

+ * Other members are set by the Checker-instance + * owning this SingleCheckResults. + */ + public SingleCheckResults() { + + this.nrOfItemsChecked = 0; + this.nrOfIssues = 0; + this.findings = new ArrayList(); + this.generalRemark = ""; + } + + public String getWhatIsChecked() { + return whatIsChecked; + } + + public void setWhatIsChecked(String whatIsChecked) { + this.whatIsChecked = whatIsChecked; + } + + public String getSourceItemName() { + return sourceItemName; + } + + public void setSourceItemName(String sourceItemName) { + this.sourceItemName = sourceItemName; + } + + public String getTargetItemName() { + return targetItemName; + } + + public void setTargetItemName(String targetItemName) { + this.targetItemName = targetItemName; + } + + public String getGeneralRemark() { + return generalRemark; + } + // nrOfIssues can be larger than findings.size(), + // if some findings occur more than once + + public void setGeneralRemark(String generalRemark) { + this.generalRemark = generalRemark; + } + + public int getNrOfItemsChecked() { + return nrOfItemsChecked; + } + + /** + * add a single finding to the collection, + * + * @param message: what kind of finding is it? + */ + public void newFinding(String message) { + addFinding(new Finding(message), 1); + } + + /** + * add a single finding to the collection, + * + * @param message: what kind of finding is it? + * @param nrOfOccurrences: how often does this occur? 
+ */ + public void newFinding(String message, int nrOfOccurrences) { + addFinding(new Finding(message), nrOfOccurrences); + } + + + /** + * add a single finding to the collection of Finding instances + * + * @param singleFinding + */ + public void addFinding(Finding singleFinding) { + findings.add(singleFinding); + incNrOfIssues(); + } + + /** + * add single Finding with multiple occurrences + */ + public void addFinding(Finding singleFinding, int nrOfOccurrences) { + findings.add(singleFinding); + addNrOfIssues(nrOfOccurrences); + } + + /** + * bookkeeping on the number of checks + */ + public void incNrOfChecks() { + nrOfItemsChecked += 1; + } + + public void addNrOfChecks(int nrOfChecksToAdd) { + nrOfItemsChecked += nrOfChecksToAdd; + } + + public void setNrOfChecks(int nrOfChecks) { + nrOfItemsChecked = nrOfChecks; + } + + /** + * bookkeeping on the number of issues + */ + public void incNrOfIssues() { + nrOfIssues += 1; + } + + public void addNrOfIssues(int nrOfIssuesToAdd) { + nrOfIssues += nrOfIssuesToAdd; + } + + + /** + * @return a description of what is checked + */ + @Override + public String description() { + return whatIsChecked; + } + + + @Override + public ArrayList getFindings() { + return findings; + } + + /** + * return a collection of finding-messages + * (used to simplify testing) + */ + public List getFindingMessages() { + return findings.stream().map(finding -> finding.getWhatIsTheProblem()).collect(Collectors.toList()); + + } + + + /** + * @return (int) the nr of issues/findings found for this checkingResults. 
+ */ + public int nrOfProblems() { + return nrOfIssues; + } + + + @Override + public String toString() { + int nrOfProblems = nrOfProblems(); /* values are concatenated below: Java strings do not expand Groovy-style $name placeholders */ + return "Checking results for " + whatIsChecked + '\n' + + " " + nrOfItemsChecked + " " + sourceItemName + " checked," + '\n' + + " " + nrOfProblems + " finding(s)" + '\n' + + findings.stream().map(it -> it.toString()).collect(Collectors.joining("\n")); + + + } + + +} + +/*===================================================================== + Copyright Gernot Starke and aim42 contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an + "AS IS" BASIS,WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ =====================================================================*/ + diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/collect/SinglePageResults.groovy b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/collect/SinglePageResults.java similarity index 61% rename from htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/collect/SinglePageResults.groovy rename to htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/collect/SinglePageResults.java index ae8102c0..abc4f0a4 100644 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/collect/SinglePageResults.groovy +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/collect/SinglePageResults.java @@ -1,91 +1,86 @@ -package org.aim42.htmlsanitycheck.collect +package org.aim42.htmlsanitycheck.collect; + +import java.util.ArrayList; +import java.util.List; /** * Collects checking results {@link Finding} of a single html page. - * + *

* Contains information about the page itself, e.g. its filename and title. * Instances of this class will be contained in {@link SingleCheckResults} */ -class SinglePageResults implements PageResults { +public class SinglePageResults implements PageResults { - public String pageFileName // from where we read the HTML - public String pageFilePath // the complete path from where we read - public String pageTitle // as given in the attribute - public int pageSize // size in byte + public String pageFileName; // from where we read the HTML + public String pageFilePath; // the complete path from where we read + public String pageTitle; // as given in the <title> attribute + public int pageSize; // size in byte - public List<SingleCheckResults> singleCheckResults + public List<SingleCheckResults> singleCheckResults; // some variants for construction public SinglePageResults() { - this.singleCheckResults = new ArrayList<SingleCheckResults>() - this.pageFileName = "" - this.pageFilePath = "" - this.pageTitle = "" - this.pageSize = 0 + this.singleCheckResults = new ArrayList<SingleCheckResults>(); + this.pageFileName = ""; + this.pageFilePath = ""; + this.pageTitle = ""; + this.pageSize = 0; } - - public SinglePageResults( SingleCheckResults scr ) { - this() - this.singleCheckResults.add( scr ) + + public SinglePageResults(SingleCheckResults scr) { + this(); + this.singleCheckResults.add(scr); } /** - * allows to add the results of a single check - **/ + * allows to add the results of a single check + **/ public void addResultsForSingleCheck(SingleCheckResults resultsForSingleCheck) { - singleCheckResults.add(resultsForSingleCheck) + singleCheckResults.add(resultsForSingleCheck); } // overhead for Groovy code - but useful for Interface documentation @Override public String getPageTitle() { - return pageTitle + return pageTitle; } @Override public String getPageFileName() { - return pageFileName + return pageFileName; } @Override public String getPageFilePath() { - return 
pageFilePath + return pageFilePath; } // query the results @Override public int nrOfItemsCheckedOnPage() { - int nrOfItemsChecked = 0 - singleCheckResults.each { - nrOfItemsChecked += it.nrOfItemsChecked - } - return nrOfItemsChecked - + return singleCheckResults.stream().map(it -> it.getNrOfItemsChecked()).reduce(0, (a, b) -> a + b); } @Override public int nrOfFindingsOnPage() { - int nrOfFindings = 0 - singleCheckResults.each { - nrOfFindings += it.nrOfProblems() - } - return nrOfFindings + return singleCheckResults.stream().map(it -> it.nrOfProblems()).reduce(0, (a, b) -> a + b); } /** * returns the number of distinct checker types that have run * (by the number of SingleCheckResults available) + * * @return */ @Override public int howManyCheckersHaveRun() { - return singleCheckResults.size() + return singleCheckResults.size(); } } diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/html/HtmlConst.groovy b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/html/HtmlConst.java similarity index 87% rename from htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/html/HtmlConst.groovy rename to htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/html/HtmlConst.java index 86b1cef5..2216105f 100644 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/html/HtmlConst.groovy +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/html/HtmlConst.java @@ -1,9 +1,9 @@ -package org.aim42.htmlsanitycheck.html +package org.aim42.htmlsanitycheck.html; class HtmlConst { - public static final String HTML_HEAD = "<!DOCTYPE HTML> <html><head></head><body>" - public static final String HTML_END = "</body></html>" + public static final String HTML_HEAD = "<!DOCTYPE HTML> <html><head></head><body>"; + public static final String HTML_END = "</body></html>"; } diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/html/HtmlElement.java 
b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/html/HtmlElement.java new file mode 100644 index 00000000..118e97ee --- /dev/null +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/html/HtmlElement.java @@ -0,0 +1,92 @@ +package org.aim42.htmlsanitycheck.html; + +import org.jsoup.nodes.Element; + +// see end-of-file for license information + +/** + * Encapsulates a single HTML element with attributes + * Relies on jsoup.select.Element + */ +public class HtmlElement { + + private final Element element; + + HtmlElement(Element node) { + this.element = node; + } + + private static String normalizeHrefString(String href) { + // remove leading # + return href.startsWith("#") ? href.substring(1) : href; + } + + /** + * @return XYZ for img src="XYZ" tags + */ + public String getImageSrcAttribute() { + if (element.tagName().equals("img")) + return element.attr("src"); + else return ""; + } + + /** + * @return XYZ for <img src="..." alt="XYZ"> + */ + public String getImageAltAttribute() { + if (element.tagName().equals("img")) + return element.attr("alt"); + else return ""; + } + + /** + * @return XYZ for 'a href="XYZ"' tags + */ + public String getHrefAttribute() { + if (element.tagName().equals("a")) { + return element.attr("href"); + } else return ""; + } + + /** + * @return XYZ for 'id="XYZ"' attributes + */ + public String getIdAttribute() { + return element.attr("id"); + + } + + /** + * @return x for '<img src="y" usemap="x"> + */ + public String getUsemapRef() { + if (element.tagName().equals("img")) { + return normalizeHrefString(element.attr("usemap")); + } + return ""; + } + + public String getHref() { + return element.attributes().get("href"); + } + + public boolean hasImageAlt() { + return element.hasAttr("alt") && !element.attr("alt").isEmpty(); + } + + public String getImgSrc() { + return element.hasAttr("src") ? 
element.attr("src") : ""; + } + + public Element node() { + return element; + } + + @Override + public String toString() { + return element.toString(); + } +} + + + diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/html/HtmlPage.java b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/html/HtmlPage.java new file mode 100644 index 00000000..3b4147b8 --- /dev/null +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/html/HtmlPage.java @@ -0,0 +1,274 @@ +package org.aim42.htmlsanitycheck.html; + +import org.aim42.htmlsanitycheck.tools.Web; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import java.util.Set; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + + +// see end-of-file for license information + +/** + * Encapsulates a "real" html parser and provides + * convenience methods to access anchor and image links + * from html. + * <p> + * Relies on http://jsoup.org parser + */ +public class HtmlPage { + + /** + * Pattern to check for HTTP/S scheme, includes the + * scheme separator (colon). + */ + private static final Pattern HTTP_SCHEME_PATTERN = Pattern.compile("(?i)^https?:"); + + // jsoup Document + private final Document document; + + /** + * The HTML file. 
+ */ + private File file; + + /** + * @param text html as text (string) + * @return an HtmlPage + */ + public HtmlPage(String text) { + // Jsoup promises to parse without exception - + // we believe it, as our wrapper is for checking + // purposes only + document = Jsoup.parse(text, "UTF-8"); + } + + /** + * @param file + * @return an HtmlPage + */ + public HtmlPage(File file) throws IOException { + assert file.exists(); + this.file = file; + document = Jsoup.parse(file, "UTF-8"); + } + + /** + * invokes the parser for the html page + * + * @param input file + */ + public static HtmlPage parseHtml(File fileToCheck) throws IOException { + assert fileToCheck.exists(); + return new HtmlPage(fileToCheck); + } + + /** + * Gets the file of the HTML page. + * + * @return the file, or null if the HTML is not from a file. + */ + public File getFile() { + return file; + } + + /** + * get document meta info (e.g. filename, title, size etc.) + */ + public int getDocumentSize() { + return document.toString().length(); + } + + public String getDocumentTitle() { + return document.title(); + } + + public String getDocumentURL() { + return document.nodeName(); + } + + public String getDocument() { + return document.toString(); + } + + /** + * builds a list of all imageMaps + * + * @return ArrayList of imageMaps + */ + public final List<HtmlElement> getAllImageMaps() { + return document.select("map").stream() + .map(HtmlElement::new) + .collect(Collectors.toList()); + } + + /** + * @return list of all imageMap-names + */ + public final List<String> getAllMapNames() { + return document.select("map").stream() + .map(m -> m.attr("name")) + .collect(Collectors.toList()); + } + + /** + * @return list of all usemap-references y with <img src="x" usemap="y" + */ + public List<String> getAllUsemapRefs() { + return getImagesWithUsemapDeclaration().stream() + .map(HtmlElement::getUsemapRef) + .collect(Collectors.toList()); + } + + /** + * builds a list from all '<img src="XYZ"/>' tags + * + * 
@return immutable ArrayList + */ + public final List<HtmlElement> getAllImageTags() { + return document.getElementsByTag("img").stream() + .map(HtmlElement::new) + .collect(Collectors.toList()); + } + + /** + * builds an immutable list of '<img src="xxx" alt="yz">, + * where "yz" is non-empty. + */ + public final List<HtmlElement> getAllImageTagsWithNonEmptyAltAttribute() { + return document.select("img[alt~=(\\S)]").stream() + .map(HtmlElement::new) + .collect(Collectors.toList()); + } + + /** + * builds an immutable list of <img...> tags, where + * the alt-tag is missing or empty (""), i.e. the complement of + * {@link #getAllImageTagsWithNonEmptyAltAttribute()}. + */ + public final List<HtmlElement> getAllImageTagsWithMissingAltAttribute() { + return document.select("img:not([alt~=(\\S)])").stream() + .map(HtmlElement::new) + .collect(Collectors.toList()); + } + + /** + * builds a list of all '<a href="XYZ"> tags + * + * @return ArrayList of all hrefs, including the "#" + */ + public List<HtmlElement> getAllAnchorHrefs() { + return document.getElementsByAttribute("href").stream() + .map(HtmlElement::new) + .collect(Collectors.toList()); + } + + /** + * builds a list of all 'id="XYZ"' attributes + * + * @return ArrayList of all hrefs + */ + public final List<HtmlElement> getAllIds() { + return document.getElementsByAttribute("id").stream() + .map(HtmlElement::new) + .collect(Collectors.toList()); + } + + /** + * @return ArrayList < String > of all href-attributes + * <p> + * common pitfalls with hrefs: + * - local hrefs start with # (like "#appendix") + * - remote hrefs should be valid URLs (like "https://google.com") + * - remote hrefs might start with other than http (e.g.
https, mailto, telnet, ssh) + * - hrefs might start with file:// + * - href might be empty string (nobody knows wtf this is good for, but html parsers usually accept it) + */ + public final List<String> getAllHrefStrings() { + return document.select("a[href]").stream() + .map(m -> m.attr("href")) + .collect(Collectors.toList()); + } + + /** + * @return immutable set of all href-attributes that start with http or https + */ + public final Set<String> getAllHttpHrefStringsAsSet() { + return document.select("a[href]") + .stream() + .filter(e -> e.hasAttr("href")) + .map(e -> e.attr("href")) + .filter(Web::isWebUrl) + .collect(Collectors.toSet()); + } + + /** + * @return immutable List of img-tags with "usemap=xyz" declaration + */ + public List<HtmlElement> getImagesWithUsemapDeclaration() { + return document.select("img[usemap]") + .stream().map(HtmlElement::new) + .collect(Collectors.toList()); + } + + /** + * html-map has the following form: + * <map name="mapName"><area...><area...></map> + * <p> + * collect all area elements for a given map. + * If more than one map exists with this name, areas + * for all maps are combined into one. 
+ * + * @param mapName name of the map + * @return + */ + public List<Elements> getAllAreasForMapName(String mapName) { + return document.select("map[name=" + mapName + "]").stream() + .map(m -> m.children().select("area")) + .collect(Collectors.toList()); + + } + + + public List<String> getAllHrefsForMapName(String mapName) { + return getAllAreasForMapName(mapName).stream() + .flatMap(Elements::stream).filter(a -> a.hasAttr("href")).map(a -> a.attr("href")) // every area's href, not just the first per map + .collect(Collectors.toList()); + } + + /** + * getAllIdStrings return ArrayList<String> of all id="xyz" definitions + */ + + public List<String> getAllIdStrings() { + return document.getAllElements().stream() + .map(Element::id) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toList()); + } + + +} +/*======================================================================== + Copyright Gernot Starke and aim42 contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an + "AS IS" BASIS,WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + ========================================================================*/ + diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/report/CreateLinkUtil.java b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/report/CreateLinkUtil.java new file mode 100644 index 00000000..7934ec47 --- /dev/null +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/report/CreateLinkUtil.java @@ -0,0 +1,17 @@ +package org.aim42.htmlsanitycheck.report; + +/** + * trivial class to convert filenames to html link targets. + * E.g.
the string "/dir/onefile.html" can be converted + * to "XdirXonefileXhtml" or similar. + */ +public class CreateLinkUtil { + public static String convertToLink(String stringWithNonWordChars) { + + // \W is regex for all non-word characters + String regex = "\\W"; + + return stringWithNonWordChars.replaceAll(regex, "X"); + } + +} diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/report/Reporter.java b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/report/Reporter.java new file mode 100644 index 00000000..0d006224 --- /dev/null +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/report/Reporter.java @@ -0,0 +1,119 @@ +package org.aim42.htmlsanitycheck.report; + +import org.aim42.htmlsanitycheck.ProductVersion; +import org.aim42.htmlsanitycheck.collect.PerRunResults; +import org.aim42.htmlsanitycheck.collect.SingleCheckResults; +import org.aim42.htmlsanitycheck.collect.SinglePageResults; + + +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.Date; +import java.util.List; +import java.util.stream.Collectors; + +/** + * superclass for reporting results. + * Subclasses will define the concrete output format + */ +public abstract class Reporter { + /** + * create the reporter + */ + public Reporter() { + this.createdOnDate = new SimpleDateFormat("dd. MMMM yyyy, HH:mm").format(new Date()); + this.createdByHSCVersion = ProductVersion.getVersion(); + + } + + /** + * Usually a Reporter instance shall be constructed with its appropriate + * + * @param runResults + * @see PerRunResults, as the latter contains all findings.
+ */ + public Reporter(PerRunResults runResults) { + this(); + this.runResults = runResults; + this.pageResults = runResults.getResultsForAllPages(); + } + + /** + * add checking results for one page + */ + public List<SinglePageResults> addCheckingResultsForOnePage(SinglePageResults singlePageResults) { + pageResults.add(singlePageResults); + pageResults.sort(Comparator.comparing(a -> a.pageTitle));// enforce sorting of the stored list itself (issue #128), as reportAllPages() iterates over it + return pageResults; + } + + /** + * main entry point for reporting - to be called when a report is requested + * <p> + * Uses template-method to delegate most concrete implementations to subclasses + */ + public void reportFindings() { + + initReport(); + + reportOverallSummary(); + + reportAllPages(); + + closeReport(); + } + + private void reportAllPages() { + + for (SinglePageResults pageResult : pageResults) { + reportPageSummary(pageResult);// delegated to subclass + reportPageDetails(pageResult);// implemented below + reportPageFooter();// delegated to subclass + } + + } + + protected void reportPageDetails(SinglePageResults pageResults) { + for (SingleCheckResults resultForOneCheck : pageResults.singleCheckResults) { + reportSingleCheckSummary((SingleCheckResults) resultForOneCheck); + reportSingleCheckDetails((SingleCheckResults) resultForOneCheck); + } + + } + + protected int totalNrOfPages() { + return pageResults.size(); + } + + protected int totalNrOfChecks() { + return runResults.nrOfChecksPerformedOnAllPages(); + } + + protected int totalNrOfFindings() { + return runResults.nrOfFindingsOnAllPages(); + } + + protected void initReport() { + // default: do nothing + } + + protected abstract void reportOverallSummary(); + + protected abstract void reportPageSummary(SinglePageResults pageResult); + + protected abstract void reportPageFooter(); + + protected abstract void reportSingleCheckSummary(SingleCheckResults singleCheckResults); +
+ protected abstract void reportSingleCheckDetails(SingleCheckResults singleCheckResults); + + protected void closeReport() { + // default: do nothing + } + + protected ArrayList<SinglePageResults> pageResults; + protected PerRunResults runResults; + protected final String createdOnDate; + protected final String createdByHSCVersion; +} diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/report/SummarizerUtil.java b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/report/SummarizerUtil.java new file mode 100644 index 00000000..a18df73f --- /dev/null +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/report/SummarizerUtil.java @@ -0,0 +1,38 @@ +package org.aim42.htmlsanitycheck.report; + +public class SummarizerUtil { + /** + * returns the percentage of successful checks. + * <p> + * Edge case: + * 0 checks -> 100% successful + */ + public static int percentSuccessful(int totalChecks, int totalNrOfFindings) { + + + // base case: if no checks performed, 100% successful + if (totalChecks <= 0) { + return 100; + } else { + + return 100 - Math.round((100f * totalNrOfFindings) / (float)totalChecks); + } + + } + + /** + * rounds one down to at most 3 digits with two decimalplaces, + * e.g. 
from + * 33450 to 33.45, from 1_234_566 to 1.23 + */ + public static float threeDigitTwoDecimalPlaces(int bigNumber) { + + + if (bigNumber >= 1_000_000) + return Math.round(bigNumber/ 10000f) / 100f; + else + return Math.round(bigNumber/ 10f) / 100f; + + } + +} diff --git a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/suggest/Suggester.groovy b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/suggest/Suggester.java similarity index 73% rename from htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/suggest/Suggester.groovy rename to htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/suggest/Suggester.java index 10ca9534..2485111b 100644 --- a/htmlSanityCheck-core/src/main/groovy/org/aim42/htmlsanitycheck/suggest/Suggester.groovy +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/suggest/Suggester.java @@ -1,8 +1,13 @@ -package org.aim42.htmlsanitycheck.suggest +package org.aim42.htmlsanitycheck.suggest; -import net.ricecode.similarity.JaroWinklerStrategy -import net.ricecode.similarity.StringSimilarityService -import net.ricecode.similarity.StringSimilarityServiceImpl +import net.ricecode.similarity.JaroWinklerStrategy; +import net.ricecode.similarity.SimilarityScore; +import net.ricecode.similarity.StringSimilarityService; +import net.ricecode.similarity.StringSimilarityServiceImpl; + +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; /** * Finds suggestions for a target within a given list of options by applying string-similarity search @@ -21,9 +26,9 @@ public class Suggester { * @param options list of available options where the suggestion is taken from * @return a suggested alternative for target from the options */ - static String determineSingleSuggestion(String target, ArrayList<String> options) { + static String determineSingleSuggestion(String target, ArrayList<String> options) { - service = new StringSimilarityServiceImpl( new JaroWinklerStrategy()); 
+ service = new StringSimilarityServiceImpl(new JaroWinklerStrategy()); return service.findTop(options, target).getKey(); } @@ -37,12 +42,15 @@ static String determineSingleSuggestion(String target, ArrayList<String> options * @param n number of suggestions to return. Should better be lower than options.size() * @return ArrayList of suggestions */ - public static List<String> determineNSuggestions(String target, ArrayList<String> options, int n) { - service = new StringSimilarityServiceImpl( new JaroWinklerStrategy()); + public static List<String> determineNSuggestions(String target, List<String> options, int n) { + service = new StringSimilarityServiceImpl(new JaroWinklerStrategy()); // the "*." operator is the coolest thing in groovy: // applies the method to all elements of the collection (usually known as "map") - return service.findBestN( options, target, n)*.getKey() + return service.findBestN(options, target, n) + .stream() + .map(SimilarityScore::getKey) + .collect(Collectors.toList()); } } diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/tools/TrustAllCertificates.java b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/tools/TrustAllCertificates.java new file mode 100644 index 00000000..6fe712db --- /dev/null +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/tools/TrustAllCertificates.java @@ -0,0 +1,60 @@ +package org.aim42.htmlsanitycheck.tools; + +// created by https://www.geekality.net/2013/09/27/java-ignore-ssl-certificate-errors/ + +import javax.net.ssl.*; +import java.security.KeyManagementException; +import java.security.NoSuchAlgorithmException; +import java.security.cert.X509Certificate; + +public final class TrustAllCertificates implements X509TrustManager, HostnameVerifier { + private TrustAllCertificates() { + } + + public static void install() { + try { + TrustAllCertificates trustAll = new TrustAllCertificates(); + + // Install the all-trusting trust manager + SSLContext sc = 
SSLContext.getInstance("SSL"); + sc.init(null, + new TrustManager[]{trustAll}, + new java.security.SecureRandom()); + HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory()); + + // Install the all-trusting host verifier + HttpsURLConnection.setDefaultHostnameVerifier(trustAll); + } catch (NoSuchAlgorithmException | KeyManagementException e) { + throw new RuntimeException("Failed setting up all thrusting certificate manager.", e); + } + } + + public X509Certificate[] getAcceptedIssuers() { + return new X509Certificate[0]; // contract: non-null, possibly empty + } + + public void checkClientTrusted(X509Certificate[] certs, String authType) { + } + + public void checkServerTrusted(X509Certificate[] certs, String authType) { + } + + public boolean verify(String hostname, SSLSession session) { + return true; + } +} + +/*============================================================= + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an + "AS IS" BASIS,WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
+ =============================================================*/ diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/tools/Web.java b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/tools/Web.java new file mode 100644 index 00000000..6c8d8709 --- /dev/null +++ b/htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/tools/Web.java @@ -0,0 +1,290 @@ +package org.aim42.htmlsanitycheck.tools; + +import java.io.IOException; +import java.net.*; +import java.util.HashSet; +import java.util.Set; +import java.util.regex.Pattern; +import java.util.stream.IntStream; + +public class Web { + + + /** + * functions to identify categories of string-representations of URLs and URIs, + * e.g. isRemote, isCrossReference, isValidIP + */ + + // these are regarded as "Success" when checking + // http(s) links + public static final Set<Integer> HTTP_SUCCESS_CODES = initErrorCodes(200, 208, 226, 226); + + public static final Set<Integer> HTTP_WARNING_CODES = initErrorCodes(100, 102, 300, 308); + + public static final Set<Integer> HTTP_ERROR_CODES = initErrorCodes(400, 451, 500, 511); + + public static final Set<Integer> HTTP_REDIRECT_CODES = initErrorCodes(301, 303, 307, 308); + + public static final Set<String> POSSIBLE_EXTENSIONS = initExtentions(); + + static private final Pattern httpPatter = Pattern.compile("(?i)^https?:"); + + static private final Pattern mailPattern = Pattern.compile("^(?i)(mailto):.*$"); + + private static final Pattern ipPattern = Pattern.compile("^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}$"); + + private static final Pattern dataImagePattern = Pattern.compile("^(?i)(data:image).*$"); + + private static final Pattern remoteImagePattern = Pattern.compile("^(?i)(https?|ftp|telnet|ssh|ssl|gopher|localhost)://.*"); + + private static final Pattern linkPattern = Pattern.compile("^//.*$"); + + private static Set<Integer> initErrorCodes(int alow, int ahigh, int blow, int high) { + Set<Integer> result =
IntStream.rangeClosed(alow, ahigh).collect(HashSet::new, Set::add, Set::addAll); + result.addAll(IntStream.rangeClosed(blow, high).collect(HashSet::new, Set::add, Set::addAll)); + return result; + } + + private static Set<String> initExtentions() { + Set<String> result = new HashSet<>(8); + result.add("html"); + result.add("htm"); + result.add("shtml"); + result.add("phtml"); + result.add("php"); + result.add("asp"); + result.add("aspx"); + result.add("xml"); + return result; + } + + + /** + * We try to check if there is a usable Internet connection available. + * Our approximation is DNS resolution: if google.com can be resolved to an IP address, + * there should be an active and usable internet connection available. + * + * @return true if Internet is (seemingly available + */ + static public boolean isInternetConnectionAvailable() { + + try { + // if we can get google's address, there is Internet... + InetAddress.getByName("google.com"); + return true; + } catch (UnknownHostException e) { + // we cannot resolve google, there might be no internet connection + return false; + } + } + + static public boolean isWebUrl(String possibleUrl) { + return httpPatter.matcher(possibleUrl).find(); + } + + public static boolean isLocahost(URL url) { + String host = url.getHost(); + return host.equals("localhost") || host.startsWith("127.0.0"); + } + + public static boolean isIP(URL url) { + return isIP(url.getHost()); + } + + public static boolean isIP(String url) { + return ipPattern.matcher(url).find(); + } + + public static HttpURLConnection getNewURLConnection(URL url) throws IOException { + + TrustAllCertificates.install(); + + HttpURLConnection connection = (HttpURLConnection) url.openConnection(); + connection.setRequestMethod("HEAD"); + + // httpConnectionTimeout defaults to 5000 (msec) + connection.setConnectTimeout(5000); + + // to avoid nasty 403 errors (forbidden), we set a referrer and user-agent + // + connection.setRequestProperty("Referer", 
"https://aim42.org"); + connection.setRequestProperty("User-Agent", "Mozilla/5.0 (X11; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0"); + + // TODO followRedirects should be a configuration parameter + // that defaults to false + + return connection; + } + + public static boolean isSuccessCode(int responseCode) { + return HTTP_SUCCESS_CODES.contains(responseCode); + } + + public static boolean isRedirectCode(int responseCode) { + return HTTP_REDIRECT_CODES.contains(responseCode); + } + + public static boolean isWarningCode(int responseCode) { + return HTTP_WARNING_CODES.contains(responseCode); + } + + public static boolean isErrorCode(int responseCode) { + return HTTP_ERROR_CODES.contains(responseCode); + } + + + /** + * Checks if this String represents a remote URL + * (startsWith http, https, ftp, telnet...) + * + * @param link + */ + public static boolean isRemoteURL(String imgSrc) { + return remoteImagePattern.matcher(imgSrc).find() + || mailPattern.matcher(imgSrc).find() + || isIP(imgSrc); + } + + + /** + * Checks if this String represents a data-image-URI + * (startsWith "data:image" + * + * @param s + */ + public static boolean isDataURI(String imgSrc) { + return dataImagePattern.matcher(imgSrc).find(); + } + + + /** + * Checks if this String represents a cross-reference, + * that is an intra-document link + * + * @param xref + */ + public static boolean isCrossReference(String xref) { + + // the simple test is if the xref starts with "#" + + return (xref.startsWith("#") && !containsInvalidChars(xref)); + + } + + /** + * helper to identify invalid characters in link + * + * @param aLink + */ + public static boolean containsInvalidChars(String aLink) { + // TODO finding illegal chars with a regex is overly simple, + // as different chars are allowed in different parts of an URI... 
+ // simple solution works for htmlSanityCheck + + + Pattern illegalCharsRegex = Pattern.compile(" |\\*|\\$"); + + return illegalCharsRegex.matcher(aLink).find(); + } + + /** + * Checks if this String represents a local resource, either: + * (1) "file://path/filename.ext" or + * (2) is a path, e.g. "directory/filename.ext" or directory or + * (3) starts with //, e.g. "index.html" + * + * @see class URLUtilSpec for details + */ + public static boolean isLocalResource(String link) { + // handle corner cases + if ((link == null) + || containsInvalidChars(link) + || (link.isEmpty()) + || isCrossReference(link) // "#link" or similar + || isRemoteURL(link) // "mailto:", "http" etc + + ) + return false; + + else { + URI aUri; + try { + aUri = new URI(link); + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } + + return ( + (isLinkToFile(aUri)) // (1) + || + linkPattern.matcher(link).find() // (3) + || + (!aUri.getPath().isEmpty()) // (2) + ); + } + } + + + /* + ** helper to identify "file scheme" + */ + private static Boolean isLinkToFile(URI aUri) { + if (aUri == null || aUri.getScheme() == null) { + return false; + } + + return aUri.getScheme().equalsIgnoreCase("file"); + } + + + /** + * Checks if this String represents a valid URI/URL + * + * @param link + * @return boolean + */ + public static boolean isValidURL(String link) { + // TODO: refactor this code to use org.apache.commons.validator.routines.* + + boolean isValid = false; + + if (isCrossReference(link)) { + return true; + + } else { + try { + URI aUri = new URL(link).toURI(); + isValid = true; + } catch (MalformedURLException e) { + isValid = false; + // ignore + + } catch (URISyntaxException e1) { + isValid = false; + } + } + + return isValid; + } +} + + +/************************************************************************ + * This is free software - without ANY guarantee! + * + * + * Copyright Dr. 
Gernot Starke, arc42.org + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + *********************************************************************** */ \ No newline at end of file diff --git a/htmlSanityCheck-core/src/main/java/org/aim42/net/TrustAllCertificates.java b/htmlSanityCheck-core/src/main/java/org/aim42/net/TrustAllCertificates.java index 24ed8f79..9b89b8fe 100644 --- a/htmlSanityCheck-core/src/main/java/org/aim42/net/TrustAllCertificates.java +++ b/htmlSanityCheck-core/src/main/java/org/aim42/net/TrustAllCertificates.java @@ -2,30 +2,17 @@ // created by https://www.geekality.net/2013/09/27/java-ignore-ssl-certificate-errors/ -import javax.net.ssl.HostnameVerifier; -import javax.net.ssl.HttpsURLConnection; -import javax.net.ssl.SSLContext; -import javax.net.ssl.SSLSession; -import javax.net.ssl.TrustManager; -import javax.net.ssl.X509TrustManager; +import javax.net.ssl.*; import java.security.KeyManagementException; import java.security.NoSuchAlgorithmException; import java.security.cert.X509Certificate; -public final class TrustAllCertificates implements X509TrustManager, HostnameVerifier -{ - public X509Certificate[] getAcceptedIssuers() {return null;} - public void checkClientTrusted(X509Certificate[] certs, String authType) {} - public void checkServerTrusted(X509Certificate[] certs, String authType) {} - public boolean verify(String hostname, SSLSession session) {return true;} - +public final class TrustAllCertificates implements 
X509TrustManager, HostnameVerifier { /** * Installs a new {@link TrustAllCertificates} as trust manager and hostname verifier. */ - public static void install() - { - try - { + public static void install() { + try { TrustAllCertificates trustAll = new TrustAllCertificates(); // Install the all-trusting trust manager @@ -37,16 +24,26 @@ public static void install() // Install the all-trusting host verifier HttpsURLConnection.setDefaultHostnameVerifier(trustAll); - } - catch (NoSuchAlgorithmException e) - { + } catch (NoSuchAlgorithmException e) { throw new RuntimeException("Failed setting up all thrusting certificate manager.", e); - } - catch (KeyManagementException e) - { + } catch (KeyManagementException e) { throw new RuntimeException("Failed setting up all thrusting certificate manager.", e); } } + + public X509Certificate[] getAcceptedIssuers() { + return null; + } + + public void checkClientTrusted(X509Certificate[] certs, String authType) { + } + + public void checkServerTrusted(X509Certificate[] certs, String authType) { + } + + public boolean verify(String hostname, SSLSession session) { + return true; + } } /*============================================================= diff --git a/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/check/BrokenHttpLinksCheckerSpec.groovy b/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/check/BrokenHttpLinksCheckerSpec.groovy index 0930620e..19a3901a 100644 --- a/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/check/BrokenHttpLinksCheckerSpec.groovy +++ b/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/check/BrokenHttpLinksCheckerSpec.groovy @@ -4,7 +4,7 @@ import org.aim42.htmlsanitycheck.Configuration import org.aim42.htmlsanitycheck.collect.SingleCheckResults import org.aim42.htmlsanitycheck.html.HtmlConst import org.aim42.htmlsanitycheck.html.HtmlPage -import org.aim42.inet.NetUtil +import org.aim42.htmlsanitycheck.tools.Web import spock.lang.Ignore import 
spock.lang.IgnoreIf import spock.lang.Specification @@ -43,7 +43,7 @@ class BrokenHttpLinksCheckerSpec extends Specification { @IgnoreIf({ Boolean.valueOf(env['INTELLIJ']) }) def "recognize if there is internet connectivity"() { expect: "if there is no internet connection, testing should fail" - NetUtil.isInternetConnectionAvailable() == true + Web.isInternetConnectionAvailable() == true } diff --git a/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/html/HtmlPageTest.groovy b/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/html/HtmlPageTest.groovy index a0650a4e..e108a773 100644 --- a/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/html/HtmlPageTest.groovy +++ b/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/html/HtmlPageTest.groovy @@ -2,6 +2,7 @@ package org.aim42.htmlsanitycheck.html import org.junit.Before import org.junit.Test +import org.aim42.htmlsanitycheck.tools.Web import static org.junit.Assert.assertEquals import static org.junit.Assert.assertTrue @@ -278,7 +279,7 @@ class HtmlPageTest { // now filter the local resources List<String> localHrefStrings = hrefs.findAll { hrefString -> - URLUtil.isLocalResource(hrefString) + Web.isLocalResource(hrefString) } assertEquals("expected 2 local resources", 2, localHrefStrings.size()) diff --git a/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/html/URLUtilSpec.groovy b/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/html/URLUtilSpec.groovy index 46f3acbe..c2495065 100644 --- a/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/html/URLUtilSpec.groovy +++ b/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/html/URLUtilSpec.groovy @@ -1,5 +1,6 @@ package org.aim42.htmlsanitycheck.html +import org.aim42.htmlsanitycheck.tools.Web import spock.lang.Specification import spock.lang.Unroll @@ -8,7 +9,7 @@ class URLUtilSpec extends Specification { //@Unroll def "invalid chars in link"(boolean 
containsInvalidChars, String link) { expect: - URLUtil.containsInvalidChars( link ) == containsInvalidChars + Web.containsInvalidChars( link ) == containsInvalidChars where: @@ -28,7 +29,7 @@ class URLUtilSpec extends Specification { @Unroll def "identify invalid links"(boolean isValid, String link) { expect: - URLUtil.isValidURL( link ) == isValid + Web.isValidURL( link ) == isValid where: @@ -46,7 +47,7 @@ class URLUtilSpec extends Specification { def "identify local resource links"(boolean isLocal, String link) { expect: - URLUtil.isLocalResource( link ) == isLocal + Web.isLocalResource( link ) == isLocal where: @@ -86,7 +87,7 @@ class URLUtilSpec extends Specification { @Unroll def "check for valid ip address"(boolean isValidIP, String ipa) { expect: - URLUtil.isValidIP(ipa) == isValidIP + Web.isIP(ipa) == isValidIP where: diff --git a/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/html/URLUtilTest.groovy b/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/html/URLUtilTest.groovy index 11e30948..c83de479 100644 --- a/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/html/URLUtilTest.groovy +++ b/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/html/URLUtilTest.groovy @@ -1,6 +1,7 @@ package org.aim42.htmlsanitycheck.html import org.junit.Test +import org.aim42.htmlsanitycheck.tools.Web import static org.junit.Assert.assertFalse import static org.junit.Assert.assertTrue @@ -17,13 +18,13 @@ class URLUtilTest { public void testFileURL() { String fileURL = "file://$IMG" - Boolean actual = URLUtil.isRemoteURL(fileURL) + Boolean actual = Web.isRemoteURL(fileURL) assertFalse("$fileURL shall be recognized as file but wasn't", actual) fileURL = "$IMG" - actual = URLUtil.isRemoteURL(fileURL) + actual = Web.isRemoteURL(fileURL) assertFalse("$fileURL is file-url but not remote-url", actual) @@ -34,7 +35,7 @@ class URLUtilTest { @Test public void testHTTPUrl() { String httpURL = "http://$AIM/$IMG" - Boolean actual = 
URLUtil.isRemoteURL(httpURL) + Boolean actual = Web.isRemoteURL(httpURL) assertTrue("$httpURL shall be recognized as remote url but wasn't", actual) } @@ -49,7 +50,7 @@ class URLUtilTest { prefixes.each { prefix -> String url = prefix + "://$AIM/$IMG" assertTrue("$prefix is remote URL but wasnt recognized", - URLUtil.isRemoteURL(url)) + Web.isRemoteURL(url)) } } @@ -60,7 +61,7 @@ class URLUtilTest { prefixes.each { prefix -> String url = prefix + ":chuck.norris@example.com" assertTrue("$prefix is mailto-link but wasnt recognized", - URLUtil.isRemoteURL(url)) + Web.isRemoteURL(url)) } } @@ -73,7 +74,7 @@ class URLUtilTest { paths.each { url -> assertFalse("$url is local but was recognized as remote", - URLUtil.isRemoteURL(url)) + Web.isRemoteURL(url)) } } @@ -90,14 +91,14 @@ class URLUtilTest { "./docs/test.html#anchor"] locals.each { it -> - assertTrue("$it not recognized as local resource", URLUtil.isLocalResource(it)) + assertTrue("$it not recognized as local resource", Web.isLocalResource(it)) } List<String> remotes = ["http://google.com", "mailto:/hello@example.com", "ftp://file.html", "https://github.com"] remotes.each { - assertFalse("$it recognized as local resource", URLUtil.isLocalResource(it)) + assertFalse("$it recognized as local resource", Web.isLocalResource(it)) } } @@ -110,7 +111,7 @@ class URLUtilTest { List<String> localResources = ["file://test.html", "FILE://test.html"] localResources.each { it -> - assertTrue("$it not recognized as local resource", URLUtil.isLocalResource(it)) + assertTrue("$it not recognized as local resource", Web.isLocalResource(it)) } } finally { Locale.setDefault(defaultLocale) @@ -127,7 +128,7 @@ class URLUtilTest { crossRefs.each { cf -> //log.info(cf) assertTrue("$cf legal cross-reference not recognized", - URLUtil.isCrossReference(cf)) + Web.isCrossReference(cf)) } } @@ -142,7 +143,7 @@ class URLUtilTest { paths.each { path -> assertFalse("$path recognized as cross-reference", - URLUtil.isCrossReference(path)) + 
Web.isCrossReference(path)) } diff --git a/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/report/SummarizerUtilSpec.groovy b/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/report/SummarizerUtilSpec.groovy index 63da7d89..6c1417f7 100644 --- a/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/report/SummarizerUtilSpec.groovy +++ b/htmlSanityCheck-core/src/test/groovy/org/aim42/htmlsanitycheck/report/SummarizerUtilSpec.groovy @@ -66,6 +66,7 @@ class SummarizerUtilSpec extends Specification { 10 | 3 | 70 3 | 1 | 66 3 | 2 | 33 + 200 | 99 | 50 1000 | 250 | 75 } diff --git a/htmlSanityCheck-core/src/test/groovy/org/aim42/inet/NetUtilSpec.groovy b/htmlSanityCheck-core/src/test/groovy/org/aim42/inet/NetUtilSpec.groovy index a40b72e6..a9c98a54 100644 --- a/htmlSanityCheck-core/src/test/groovy/org/aim42/inet/NetUtilSpec.groovy +++ b/htmlSanityCheck-core/src/test/groovy/org/aim42/inet/NetUtilSpec.groovy @@ -1,5 +1,6 @@ package org.aim42.inet +import org.aim42.htmlsanitycheck.tools.Web import spock.lang.Specification import spock.lang.Unroll @@ -12,7 +13,7 @@ class NetUtilSpec extends Specification { @Unroll def "success return codes contain #successCode"() { expect: - successCode in NetUtil.HTTP_SUCCESS_CODES + successCode in Web.HTTP_SUCCESS_CODES where: successCode << [200,201,202] @@ -21,7 +22,7 @@ class NetUtilSpec extends Specification { @Unroll def "error codes contain #errorCode"() { expect: - errorCode in NetUtil.HTTP_ERROR_CODES + errorCode in Web.HTTP_ERROR_CODES where: errorCode << [400,401,402,403,404,405,406,407,408,409,410,500,501,502,503,504,505] @@ -36,6 +37,6 @@ class NetUtilSpec extends Specification { expect: - NetUtil.isInternetConnectionAvailable() == true + Web.isInternetConnectionAvailable() == true } } \ No newline at end of file diff --git a/src/docs/arc42/chapters/_config.adoc b/src/docs/arc42/chapters/_config.adoc index 582f1acc..2cf24631 100644 --- a/src/docs/arc42/chapters/_config.adoc +++ 
b/src/docs/arc42/chapters/_config.adoc @@ -23,7 +23,7 @@ ifndef::imagesdir[:imagesdir: ../images] :project-repository-docs-link: https://github.com/aim42/htmlSanityCheck/blob/main/src/docs/ :project-repository-docs-edit-link: https://github.com/aim42/htmlSanityCheck/edit/main/src/docs/ -:coresourcepath: {projectRootDir}/htmlSanityCheck-core/src/main/groovy/org/aim42 +:coresourcepath: {projectRootDir}/htmlSanityCheck-core/src/main/java/org/aim42 :coretestpath: {projectRootDir}/htmlSanityCheck-core/src/test/groovy/org/aim42 :asciidoctor-gradle-plugin-url: https://github.com/asciidoctor/asciidoctor-gradle-plugin diff --git a/src/docs/arc42/chapters/chap-05-BuildingBlocks.adoc b/src/docs/arc42/chapters/chap-05-BuildingBlocks.adoc index e6e2a100..fee0235b 100644 --- a/src/docs/arc42/chapters/chap-05-BuildingBlocks.adoc +++ b/src/docs/arc42/chapters/chap-05-BuildingBlocks.adoc @@ -190,7 +190,7 @@ overall `RunResults`, single-page results (`PageResults`) and single-check resul [source, groovy] .Interface RunResults ---- -include::{coresourcepath}/htmlsanitycheck/collect/RunResults.groovy[tags=RunResultInterface] +include::{coresourcepath}/htmlsanitycheck/collect/RunResults.java[tags=RunResultInterface] ---- @@ -198,12 +198,12 @@ include::{coresourcepath}/htmlsanitycheck/collect/RunResults.groovy[tags=RunResu [source, groovy] .Interface PageResults ---- -include::{coresourcepath}/htmlsanitycheck/collect/PageResults.groovy[tags=PageResultInterface] +include::{coresourcepath}/htmlsanitycheck/collect/PageResults.java[tags=PageResultInterface] ---- [source, groovy] .Interface CheckResults ---- -include::{coresourcepath}/htmlsanitycheck/collect/CheckResults.groovy[tags=CheckResultsInterface] +include::{coresourcepath}/htmlsanitycheck/collect/CheckResults.java[tags=CheckResultsInterface] ---- diff --git a/src/docs/development/issue-252.adoc b/src/docs/development/issue-252.adoc index ff2551b2..4cc5dabf 100644 --- a/src/docs/development/issue-252.adoc +++ 
b/src/docs/development/issue-252.adoc @@ -41,7 +41,7 @@ for URLs with `prefixOnlyHref`. the default should be the list given above (`ITEM_NAME_prefixOnlyHrefExtensions`) -* Add these defaults to `NetUtil` +* Add these defaults to `Web`