From e3b8c5540e90a964b655be6ca73c1ee1aeba2719 Mon Sep 17 00:00:00 2001 From: Aditya Mitra <55396651+aditya-mitra@users.noreply.github.com> Date: Thu, 24 Jun 2021 01:53:21 +0530 Subject: [PATCH] refactor(perf): make the action faster (#3) ## Summary Instead of using the Docker image published by Algolia for docsearch, this PR uses the scraper's source repository for scraping and uploading to Algolia. ## Details Using the source repository removes the need for jq, the docker-cli installation, the Algolia docsearch Docker image, and other peer dependencies. With python:3.6 as the base image (which comes with git preinstalled), the algolia/docsearch-scraper repository is cloned first. pipenv is then installed and used to install the packages from the Pipfile. ## Improvements The running time of the action has now been reduced by 40 seconds. ## Further Comments I have made a few other fixes/corrections, such as correcting the spelling of Algolia. Also, I changed config.example.json, since the previous example took a long time to index and the difference in running time would not have been clear. 
Closes #2 --- .github/workflows/main.yml | 9 +++++--- Dockerfile | 2 +- README.md | 9 ++++---- action.yml | 4 ++-- config.example.json | 47 +++++++++++--------------------------- entrypoint.sh | 34 +++++++++++++++------------ 6 files changed, 46 insertions(+), 59 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6fb6bf1..c5a8f68 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,13 +1,16 @@ -on: [push] +name: Test the Action + +on: + - push jobs: example_job: runs-on: ubuntu-latest name: test the action steps: - - uses: actions/checkout@master + - uses: actions/checkout@v2 - name: test - uses: darrenjennings/algolia-docsearch-action@master + uses: ./ with: algolia_api_key: ${{ secrets.ALGOLIA_API_KEY }} algolia_application_id: ${{ secrets.ALGOLIA_APPLICATION_ID }} diff --git a/Dockerfile b/Dockerfile index 40a7280..1d48560 100755 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:latest +FROM python:3.6 COPY entrypoint.sh /entrypoint.sh diff --git a/README.md b/README.md index 0ee9567..e4ebbbd 100644 --- a/README.md +++ b/README.md @@ -5,19 +5,18 @@ This action runs the docsearch scraper and updates an index. 
## Inputs ### `algolia_application_id` -**Required** 'Aloglia docsearch `APPLICATION_ID` +**Required** Algolia docsearch `APPLICATION_ID` ### `algolia_api_key` -**Required** Aloglia docsearch `API_KEY` +**Required** Algolia docsearch `API_KEY` ### `file` -**Required** File able to be accessed from $GITHUB_WORKSPACE, used in tandem -with `actions/checkout@master` +**Required** File able to be accessed from $GITHUB_WORKSPACE, used in tandem with `actions/checkout@master` ## Example usage ```yaml -- uses: actions/checkout@master +- uses: actions/checkout@v2 - uses: darrenjennings/algolia-docsearch-action@master with: algolia_application_id: 'XXXXX83LWT' diff --git a/action.yml b/action.yml index 7dd91d6..d37c0be 100644 --- a/action.yml +++ b/action.yml @@ -5,10 +5,10 @@ branding: color: 'blue' inputs: algolia_application_id: - description: 'Aloglia docsearch APPLICATION_ID' + description: 'Algolia docsearch APPLICATION_ID' required: true algolia_api_key: - description: 'Aloglia docsearch API_KEY' + description: 'Algolia docsearch API_KEY' required: true file: description: 'File path to docsearch' diff --git a/config.example.json b/config.example.json index f0a710d..bb44f4e 100644 --- a/config.example.json +++ b/config.example.json @@ -1,41 +1,20 @@ { - "index_name": "prod_EE", + "index_name": "algolia_docsearch_action", "start_urls": [ - { - "url": "https://docs.konghq.com/enterprise/(?P.*?)/", - "variables": { - "version": { - "url": "https://docs.konghq.com/enterprise/", - "js": "var versions = $('ul[aria-labelledby=version-dropdown] a, button#version-dropdown').map(function(i, e) { return $(e).text().replace(/\\s+/g, '').replace(/Version/g, '').replace('(2020)', '').replace('(latest)', ''); }).toArray(); return JSON.stringify(versions);" - } - } - } - ], - "sitemap_urls": [ - "https://docs.konghq.com/sitemap.xml" - ], - "stop_urls": [ - + "https://aquaimpact.github.io/CovidSusTrackerDocs" ], + "stop_urls": [], "selectors": { - "lvl0": { - "selector": 
".docs-navigation > a.active", - "global": true, - "default_value": "Kong" - }, - "lvl1": ".content h1", - "lvl2": ".content h2", - "lvl3": ".content h3", - "lvl4": ".content h4", - "text": ".content p, .content li" + "lvl0": ".doc-content h1", + "lvl1": ".doc-content h2", + "lvl2": ".doc-content h3", + "lvl3": ".doc-content h4", + "lvl4": ".doc-content h5", + "lvl5": ".doc-content h6", + "text": ".doc-content p, .doc-content li" }, - "selectors_exclude": [ - "#next-steps", - "#next-steps ~ p" - ], - "only_content_level": true, "conversation_id": [ - "534091583" + "1313246279" ], - "nb_hits": 18645 -} + "nb_hits": 98 +} \ No newline at end of file diff --git a/entrypoint.sh b/entrypoint.sh index 4d15474..d2d2fd1 100755 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -4,17 +4,23 @@ APPLICATION_ID=$1 API_KEY=$2 FILE=$3 -apt update -apt install jq -y - -# install docker -apt install apt-transport-https ca-certificates curl software-properties-common -y -curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - -add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu bionic stable" -apt update -apt-cache policy docker-ce -apt install docker-ce -y - -ls -la $GITHUB_WORKSPACE -cat $GITHUB_WORKSPACE/$FILE | jq -r tostring -docker run -e APPLICATION_ID=$APPLICATION_ID -e API_KEY=$API_KEY -e "CONFIG=$(cat $GITHUB_WORKSPACE/$FILE | jq -r tostring)" algolia/docsearch-scraper +# build from the main source repository +git clone https://github.com/algolia/docsearch-scraper.git + +cd docsearch-scraper/ + +# install pipenv without cache +pip install --no-cache-dir --trusted-host pypi.python.org pipenv + +# install packages without virtualenv +pipenv install --system --deploy --ignore-pipfile + +# create the .env file for docsearch +echo "APPLICATION_ID=${APPLICATION_ID} +API_KEY=${API_KEY} +" > .env + +# run algolia docsearch +python docsearch run $GITHUB_WORKSPACE/$FILE + +echo "🚀 Successfully indexed and uploaded the results to Algolia" \ No 
newline at end of file