diff --git a/config/tech-docs.yml b/config/tech-docs.yml index 901cb67c75..ef1388defb 100644 --- a/config/tech-docs.yml +++ b/config/tech-docs.yml @@ -79,8 +79,8 @@ redirects: /apps/by-team.html: /apps.html /apps/ckan-functional-tests.html: /repos/ckan-functional-tests.html /apps/ckanext-datagovuk.html: /repos/ckanext-datagovuk.html - /apps/collections.html: /repos/collections.html /apps/collections-publisher.html: /repos/collections-publisher.html + /apps/collections.html: /repos/collections.html /apps/contacts-admin.html: /repos/contacts-admin.html /apps/contacts-frontend.html: /repos/contacts-frontend.html /apps/content-data-admin.html: /repos/content-data-admin.html @@ -111,18 +111,18 @@ redirects: /apps/govuk_content_api.html: /repos/govuk_content_api.html /apps/govuk_schemas.html: /repos/govuk_schemas.html /apps/hmrc-manuals-api.html: /repos/hmrc-manuals-api.html - /apps/places-manager.html: /repos/places-manager.html /apps/licensify.html: /repos/licensify.html /apps/link-checker-api.html: /repos/link-checker-api.html /apps/local-links-manager.html: /repos/local-links-manager.html /apps/locations-api.html: /repos/locations-api.html /apps/manuals-publisher.html: /repos/manuals-publisher.html /apps/maslow.html: /repos/maslow.html + /apps/places-manager.html: /repos/places-manager.html /apps/publisher.html: /repos/publisher.html /apps/publishing-api.html: /repos/publishing-api.html /apps/release.html: /repos/release.html - /apps/router.html: /repos/router.html /apps/router-api.html: /repos/router-api.html + /apps/router.html: /repos/router.html /apps/seal.html: /repos/seal.html /apps/search-admin.html: /repos/search-admin.html /apps/search-api.html: /repos/search-api.html @@ -134,8 +134,8 @@ redirects: /apps/special-route-publisher.html: /repos/special-route-publisher.html /apps/specialist-publisher.html: /repos/specialist-publisher.html /apps/static.html: /repos/static.html - /apps/support.html: /repos/support.html /apps/support-api.html: 
/repos/support-api.html + /apps/support.html: /repos/support.html /apps/transition.html: /repos/transition.html /apps/travel-advice-publisher.html: /repos/travel-advice-publisher.html /apps/whitehall.html: /repos/whitehall.html @@ -144,7 +144,11 @@ redirects: /guides.html: /manual.html /manual/access-aws-console.html: /manual/get-started.html /manual/add-a-pingdom-check.html: /manual/pingdom.html + /manual/alerts/content-data-api-app-healthcheck-not-ok.html: /manual/content-data-api-app-healthcheck-not-ok.html + /manual/alerts/elasticsearch-cluster-health.html: /manual/elasticsearch-cluster-health.html /manual/alerts/email-alerts.html: /manual/alerts/email-alerts-travel-medical.html + /manual/alerts/publisher-unprocessed-fact-check-emails.html: /manual/publisher-unprocessed-fact-check-emails.html + /manual/alerts/whitehall-scheduled-publishing.html: /manual/whitehall-scheduled-publishing.html /manual/archiving-and-redirecting-content.html: /manual/redirect-routes.html /manual/bouncer.html: /manual/transition-architecture.html /manual/bulk-email.html: /apps/email-alert-api/bulk-email.html @@ -164,15 +168,14 @@ redirects: /manual/elasticsearch.html: /manual/elasticsearch-dumps.html /manual/emergency-publishing-redis.html: /manual/emergency-publishing.html /manual/error-reporting.html: /manual/sentry.html - /manual/fastly-error-rate.html: /manual/alerts/fastly-error-rate.html /manual/fix-problems-with-vagrant.html: /manual.html /manual/gds-cli.html: /manual/access-aws-console.html /manual/github-access.html: /manual/github.html /manual/give-a-content-designer-access-to-github.html: /manual/github.html /manual/howto-manually-remove-assets.html: /manual/manage-assets.html /manual/howto-merge-a-pull-request-from-an-external-contributor.html: /manual/merge-pr.html - /manual/howto-replace-an-assets-file.html: /manual/manage-assets.html /manual/howto-remove-change-note-from-whitehall.html: /manual/howto-remove-change-note.html + /manual/howto-replace-an-assets-file.html: 
/manual/manage-assets.html /manual/howto-transition-a-site-to-govuk.html: /manual/transition-a-site.html /manual/howto-upload-an-asset-to-asset-manager.html: /manual/manage-assets.html /manual/intro-to-docker-even-more.html: /manual/how-govuk-docker-works.html diff --git a/source/manual/alerts/aws-rds-memory.html.md b/source/manual/alerts/aws-rds-memory.html.md deleted file mode 100644 index 075469d59b..0000000000 --- a/source/manual/alerts/aws-rds-memory.html.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -owner_slack: "#govuk-2ndline-tech" -title: AWS RDS Instance Memory Utilization -parent: "/manual.html" -layout: manual_layout -section: Icinga alerts ---- - -This alert relates to memory usage of our database (RDS) instances in AWS. To check the current usage: - -- [Access the AWS web console][] and view the statistics. - -[Access the AWS web console]: https://eu-west-1.console.aws.amazon.com/rds/home?region=eu-west-1 diff --git a/source/manual/alerts/aws-rds-storage.html.md b/source/manual/alerts/aws-rds-storage.html.md deleted file mode 100644 index 50da6f90b7..0000000000 --- a/source/manual/alerts/aws-rds-storage.html.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -owner_slack: "#govuk-2ndline-tech" -title: AWS RDS Instance Storage Utilization -parent: "/manual.html" -layout: manual_layout -section: Icinga alerts ---- - -This alert relates to disk usage of our databases (RDS) in AWS being higher than we would expect. To check the current usage. - -- [Access the AWS web console][] and view the statistics. 
- -[Access the AWS web console]: https://eu-west-1.console.aws.amazon.com/rds/home?region=eu-west-1 diff --git a/source/manual/alerts/check-status-gcp-mirror-sync-job.html.md b/source/manual/alerts/check-status-gcp-mirror-sync-job.html.md deleted file mode 100644 index 27180838ba..0000000000 --- a/source/manual/alerts/check-status-gcp-mirror-sync-job.html.md +++ /dev/null @@ -1,92 +0,0 @@ ---- -owner_slack: "#govuk-2ndline-tech" -title: Check status of latest GCP mirror sync job -section: Icinga alerts -layout: manual_layout -parent: "/manual.html" ---- - -This alert means that the latest mirror sync job from AWS S3 to GCP GCS Google Cloud Storage (GCS) failed. The mirror sync job exists to ensure that we have an up-to-date copy of the mirrored contents of GOV.UK from the AWS S3 bucket `govuk-production-mirror` within the same-named bucket in GCS, should we ever need to [fall back to the static mirror][fallback to mirror] hosted on GCP. The job lives within GCP Data Transfer and runs everyday at 18:00 UTC. - -Occasionally we see errors during the mirror sync process, such as files not being found. Previously there was no remedial action we could take on these errors as the Data Transfer API was broken, however now that it has been fixed it is straightforward to retry the failed job. - -## Manually retrying the mirror sync job - -### Prerequisites: - -- You need to have production access. -- You need to have been added to the appropriate Google group. - -### Via the GCP Console: - -1. Go to the [Data Transfer][] section in the GCP Console, ensuring you're viewing the GOV.UK Production project. -2. Click on the transfer job named _daily sync of the primary govuk-production-mirror S3 bucket_. -3. Click on the _Start a run_ button at the top of the page to queue a new transfer operation. -4. A new transfer operation will appear below in the _Run history_ table, which you can tail by clicking its start time. - -### Via the GCP Data Transfer API: - -1. 
SSH into the Monitoring box: `gds govuk connect -e production ssh aws/monitoring`. -2. Run the following to initiate a new transfer operation: - -```sh -export GOOGLE_APPLICATION_CREDENTIALS=/etc/govuk/gcloud_auth.json -gcp_project_id=govuk-production -gcp_transfer_service_token=$(gcloud auth application-default print-access-token) -transfer_jobs=$(curl --silent --header "Content-Type: application/json" \ - --header "Authorization: Bearer $gcp_transfer_service_token" \ - --request GET "https://storagetransfer.googleapis.com/v1/transferJobs?filter=%7B%22projectId%22%3A%22$gcp_project_id%22%7D") -transfer_job_name=$(echo $transfer_jobs | jq -c -r ".transferJobs[] | select(.description | contains(\"${gcp_project_id}\")) | .name") -transfer_operation=$(curl --header "Content-Type: application/json" \ - --header "Authorization: Bearer $gcp_transfer_service_token" \ - --request POST \ - --data '{"projectId": "govuk-production"}' \ - "https://storagetransfer.googleapis.com/v1/$transfer_job_name:run") -``` - -3. `transfer_operation` should contain a JSON response confirming that a new transfer operation has been queued: - -```json -{ - ... - "metadata": { - "@type": "type.googleapis.com/google.storagetransfer.v1.TransferOperation", - "projectId": "govuk-production", - "transferSpec": { - "awsS3DataSource": { - "bucketName": "govuk-production-mirror" - }, - "gcsDataSink": { - "bucketName": "govuk-production-mirror" - }, - "transferOptions": { - "deleteObjectsUniqueInSink": true - } - }, - "startTime": "2021-07-13T14:23:04.902943622Z", - "status": "QUEUED", - "counters": {}, - ... - }, - ... -} -``` - -4. 
Finally, you can check the status of the new transfer operation by calling: - -```sh -latest_operation_name=$(echo $transfer_operation | jq -r .name) -latest_operation_details=$(curl --silent --header "Content-Type: application/json" \ - --header "Authorization: Bearer $gcp_transfer_service_token" \ - --request GET "https://storagetransfer.googleapis.com/v1/$latest_operation_name") -echo $latest_operation_details | jq -r '.metadata.status' -``` - -## Further actions: - -If you continue to experience errors with the job after manually forcing a retry, it's possible that the GOV.UK crawler worker hasn't finished crawling or that crawled pages haven't been fully uploaded to S3. This used to be a regular problem until the mirror sync job schedule was moved from 12:00 UTC to 18:00 UTC. It is also inevitable over time as we accrue new pages that the crawler process will take longer to complete, which could also result in the previously mentioned situation. - -When retrying the mirror sync job continues to result in errors, it is advisable to wait until the mirror sync job is next scheduled to run before taking further action. - -[Data Transfer]: https://console.cloud.google.com/transfer/cloud/jobs -[fallback to mirror]: /manual/fall-back-to-mirror.html diff --git a/source/manual/alerts/content-publisher-government-data-check-not-ok.html.md b/source/manual/alerts/content-publisher-government-data-check-not-ok.html.md deleted file mode 100644 index 34e2a32141..0000000000 --- a/source/manual/alerts/content-publisher-government-data-check-not-ok.html.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -owner_slack: "#govuk-pubworkflow-dev" -title: content-publisher government-data check not ok -section: Icinga alerts -layout: manual_layout -parent: "/manual.html" ---- - -This means that Content Publisher is having trouble updating the data it holds on current and previous governments. 
Ordinarily it reloads this data from the Publishing API every fifteen minutes and seeing this error means it hasn't happened in at least 6 hours. After 24 hours the cache of government data will clear and the app will stop working as expected. The following suggestions should help to isolate the problem. - -- Check [Sentry][] for any recent errors that indicate reasons the job is failing -- Ensure there aren't alerts indicating the Content Publisher sidekiq process isn't running -- Run `PopulateBulkDataJob.perform_now` manually in the [Content Publisher console][console] to see if issues occur. [Link to job][data job] - -[Sentry]: [https://sentry.io/organizations/govuk/issues/?project=1242052] -[data job]: [https://github.com/alphagov/content-publisher/blob/main/app/jobs/populate_bulk_data_job.rb] -[console]: [/manual/get-ssh-access.html#running-a-console] diff --git a/source/manual/alerts/email-alert-api-high-queue-latency.html.md b/source/manual/alerts/email-alert-api-high-queue-latency.html.md deleted file mode 100644 index 1801ae1712..0000000000 --- a/source/manual/alerts/email-alert-api-high-queue-latency.html.md +++ /dev/null @@ -1,59 +0,0 @@ ---- -owner_slack: "#govuk-2ndline-tech" -title: 'Email Alert API: high latency for sidekiq queue' -section: Icinga alerts -subsection: Email alerts -layout: manual_layout -parent: "/manual.html" ---- - -This alert triggers when there is a significant delay in the time from the -Email Alert API system creating an email until it is sent. - -## Understanding the alert - -The latency value itself is a measure, in seconds, of the amount of time that -the oldest email in the [Sidekiq] queue has been waiting. This latency builds -up when there are more emails to send than workers to send them and this alert -triggers once this latency reaches an alarming level. - -## Impact - -The user impact of this alert depends on the queue. 
A delay for the -`send_email_transactional` queue indicates that users are blocked from -completing user journeys (such as sign-in or sign up), thus a delay of minutes -is frustrating. - -For the other [queues] the impact of delayed email is less significant for -users (there aren't assurances on how rapidly an email should be sent) -and indicates a risk that the system is experiencing significantly -degraded performance and may become perpetually overloaded. For -example, if there aren't sufficient resources to send all of Monday's emails -on Monday we could find there aren't resources to send both Monday _and_ -Tuesday's emails the next day and so forth. - -## How to investigate - -You should be looking for evidence of performance degradation, the presence of -errors and whether an abnormal quantity of emails is being created. Some -diagnostic steps you could take are: - -* check the [Sidekiq dashboard] to understand the level of work Sidekiq is - doing and has to do; -* monitor the [Email Alert API Technical dashboard][technical dash] to see - the rate emails are being sent at and look at broader view of Email Alert API - status; -* check whether workers are raising errors to [Sentry]; -* check [Kibana] to see the Sidekiq logs for Email Alert API; -* you can investigate the health of the [underlying application - machines][machine metrics] and the [RDS PostgeSQL database - instance][postgres dash]. 
- -[Sidekiq]: /manual/sidekiq.html -[queues]: https://github.com/alphagov/email-alert-api/blob/main/config/sidekiq.yml -[Sidekiq dashboard]: https://grafana.blue.production.govuk.digital/dashboard/file/sidekiq.json?refresh=1m&orgId=1&var-Application=email-alert-api&var-Queues=All&from=now-3h&to=now -[technical dash]: https://grafana.blue.production.govuk.digital/dashboard/file/email_alert_api_technical.json -[Sentry]: https://sentry.io/organizations/govuk/issues/?project=202220&statsPeriod=12h -[Kibana]: https://kibana.logit.io/s/2dd89c13-a0ed-4743-9440-825e2e52329e/app/kibana#/discover?_g=(refreshInterval:(display:Off,pause:!f,value:0),time:(from:now-1h,mode:quick,to:now))&_a=(columns:!('@message',host),index:'*-*',interval:auto,query:(query_string:(query:'@type:%20sidekiq%20AND%20application:%20email-alert-api')),sort:!('@timestamp',desc)) -[machine metrics]: https://grafana.blue.production.govuk.digital/dashboard/file/machine.json?refresh=1m&orgId=1&var-hostname=email_alert_api*&var-cpmetrics=cpu-system&var-cpmetrics=cpu-user&var-filesystem=All&var-disk=All&var-tcpconnslocal=All&var-tcpconnsremote=All -[postgres dash]: https://grafana.production.govuk.digital/dashboard/file/aws-rds.json?orgId=1&var-region=eu-west-1&var-dbinstanceidentifier=email-alert-api-postgres&from=now-3h&to=now diff --git a/source/manual/alerts/email-alert-api-unprocessed-work.html.md b/source/manual/alerts/email-alert-api-unprocessed-work.html.md deleted file mode 100644 index d394389bfb..0000000000 --- a/source/manual/alerts/email-alert-api-unprocessed-work.html.md +++ /dev/null @@ -1,64 +0,0 @@ ---- -owner_slack: "#govuk-2ndline-tech" -title: 'Email Alert API: Unprocessed work' -section: Icinga alerts -subsection: Email alerts -layout: manual_layout -parent: "/manual.html" ---- - -This alert indicates that Email Alert API has work that has not been processed in the generous amount of time we expect it to have been. Which alert you see depends on the type of work. 
- -* **[`unprocessed content changes`](https://github.com/alphagov/email-alert-api/blob/main/app/workers/process_content_change_worker.rb)**. - - * This means there is a signficiant delay in generating emails for subscribers with "immediate" frequency subscriptions in response to [a change in some content] on GOV.UK. - -* **[`unprocessed messages`](https://github.com/alphagov/email-alert-api/blob/main/app/workers/process_message_worker.rb)**. - - * This means there is a significant delay in generating emails for subscribers with "immediate" frequency subscriptions in response to [a custom message]. - -* **`incomplete digest runs`**. - - * This could be due to a failure in any of three workers: - - * [\[Daily/Weekly\]DigestInitiatorWorker](https://github.com/alphagov/email-alert-api/blob/a656389b1abdd46226ca37c1682c318f1c2eafee/app/workers/daily_digest_initiator_worker.rb) generates a DigestRunSubscriber work item for each subscriber. - * [DigestEmailGenerationWorker](https://github.com/alphagov/email-alert-api/blob/a656389b1abdd46226ca37c1682c318f1c2eafee/app/workers/digest_email_generation_worker.rb) does the work of generating the digest email for a specific subscriber. - * [DigestRunCompletionMarkerWorker](https://github.com/alphagov/email-alert-api/blob/a656389b1abdd46226ca37c1682c318f1c2eafee/app/workers/digest_run_completion_marker_worker.rb) periodically scans all the work items to see if the run is complete. - -Each of the alerts is based on custom metrics that we collect using [a periodic job](https://github.com/alphagov/email-alert-api/blob/a656389b1abdd46226ca37c1682c318f1c2eafee/app/workers/metrics_collection_worker.rb). The metric will be something like "amount of unprocessed work older than X amount of time" ([example](https://github.com/alphagov/email-alert-api/blob/a656389b1abdd46226ca37c1682c318f1c2eafee/app/workers/metrics_collection_worker/content_change_exporter.rb#L16)). 
- -## Automatic recovery - -Sometimes we lose work due to [a flaw with the Sidekiq queueing system](https://github.com/mperham/sidekiq/wiki/Problems-and-Troubleshooting#my-sidekiq-process-is-crashing-what-do-i-do). In order to cope with this scenario, a [RecoverLostJobsWorker] runs every 30 minutes, and will try to requeue work that has not been processed [within an hour](https://github.com/alphagov/email-alert-api/blob/2f3931ac1ca25fe8c79b2405af98d1de55e1d47b/app/workers/recover_lost_jobs_worker/unprocessed_check.rb#L13). If work is being repeatedly lost, the alert will fire and you'll need to investigate manually. - -## Manual steps to fix - -Things to check: - -* Check [Sentry] for errors. - -* Check the [Sidekiq dashboard] for worker failures. - -* Check [Kibana] for errors - use ```@fields.worker: ``` for the query. - -* Check the [Email Alert API Technical dashboard] for performance issues. - -If all else fails, you can try running the work manually from a console. [The automatic recovery worker](https://github.com/alphagov/email-alert-api/blob/2f3931ac1ca25fe8c79b2405af98d1de55e1d47b/app/workers/recover_lost_jobs_worker/unprocessed_check.rb#L13) code is a good example of how to do this, but you will need to use `new.perform` instead of `perform_async`. - -> A digest run may be "complete" - all work items generated, all work items processed - but not marked as such. 
In this case, you will need to use slightly different commands to investigate the incomplete run: -> -> ```ruby -> # find which digests are "incomplete" -> DigestRun.where("created_at < ?", 1.hour.ago).where(completed_at: nil) -> -> # try manually marking it as complete -> DigestRunCompletionMarkerWorker.new.perform -> ``` - -[Sentry]: https://sentry.io/organizations/govuk/issues/?project=202220&statsPeriod=6h -[a custom message]: https://github.com/alphagov/email-alert-api/blob/main/docs/api.md#post-messages -[a change in some content]: https://github.com/alphagov/email-alert-api/blob/main/docs/api.md#post-content-changes -[Kibana]: https://kibana.logit.io/s/2dd89c13-a0ed-4743-9440-825e2e52329e/app/kibana#/discover?_g=(refreshInterval:(display:Off,pause:!f,value:0),time:(from:now-1h,mode:quick,to:now))&_a=(columns:!('@message',host),index:'*-*',interval:auto,query:(query_string:(query:'@type:%20sidekiq%20AND%20application:%20email-alert-api%20AND%20@fields.worker:%20ProcessContentChangeWorker')),sort:!('@timestamp',desc)) -[RecoverLostJobsWorker]: https://github.com/alphagov/email-alert-api/blob/main/app/workers/recover_lost_jobs_worker.rb -[Sidekiq dashboard]: https://grafana.production.govuk.digital/dashboard/file/sidekiq.json?refresh=1m&orgId=1&var-Application=email-alert-api&var-Queues=All&from=now-3h&to=now -[Email Alert API Technical dashboard]: https://grafana.production.govuk.digital/dashboard/file/email_alert_api_technical.json?refresh=1m&orgId=1 diff --git a/source/manual/alerts/fastly-error-rate.html.md b/source/manual/alerts/fastly-error-rate.html.md deleted file mode 100644 index ff5e2255a0..0000000000 --- a/source/manual/alerts/fastly-error-rate.html.md +++ /dev/null @@ -1,58 +0,0 @@ ---- -owner_slack: "#govuk-2ndline-tech" -title: Fastly error rate for GOV.UK -section: Icinga alerts -layout: manual_layout -parent: "/manual.html" ---- - -We get response code reporting from Fastly (with a 15 minute delay). 
It -averages out the last 15 minutes worth of 5xx errors. This is a useful -supplementary metric to highlight low-level errors that occur over a longer -period of time. - -The alert appears on `monitoring-1.management`. - -It is possible to [query the CDN logs using AWS Athena][query-cdn-logs]. -The following query gets a count of URLs where a 5xx error has been served -between the given timestamps: - -``` -SELECT url, status, COUNT(*) AS count -FROM fastly_logs.govuk_www -WHERE status >= 500 AND status <= 599 -AND request_received >= TIMESTAMP '2018-11-26 11:00' -AND request_received < TIMESTAMP '2018-11-26 11:30' -AND date = 26 AND month = 11 AND year = 2018 -GROUP BY url, status -ORDER BY count DESC -``` - -It is also possible to examine the raw Fastly CDN logs: - -- `ssh monitoring-1.management.production` -- `cd /var/log/cdn` to access log files - -## `Unknown` alert - -The alert appears on `monitoring-1.management`. Collectd uses the Fastly -API to get statistics which it pushes to Graphite. If the alert is unknown, -collectd likely cannot talk to Fastly so restart collectd. - -```shell -$ sudo service collectd restart -``` - -To prove collectd is the problem, use this query in Kibana: - -```rb -syslog_hostname:monitoring-1.management AND syslog_program:collectd -``` - -You will see many reports similar to: - -``` -cdn_fastly plugin: Failed to query service -``` - -[query-cdn-logs]: /manual/query-cdn-logs.html diff --git a/source/manual/alerts/nginx-requests-too-low.html.md b/source/manual/alerts/nginx-requests-too-low.html.md deleted file mode 100644 index d566361b7f..0000000000 --- a/source/manual/alerts/nginx-requests-too-low.html.md +++ /dev/null @@ -1,27 +0,0 @@ ---- -owner_slack: "#govuk-2ndline-tech" -title: Nginx requests too low -parent: "/manual.html" -layout: manual_layout -section: Icinga alerts ---- - -We monitor the number of requests reaching our Nginx servers. 
We expect that -there will be a minimum number of requests occurring and a check will alert if -this falls below a threshold. - -[View the Nginx logs in Kibana][nginx_logs] or view the impacted application dashboard on -Grafana. - -There are a few things to check when this occurs: - -- On Staging and Integration, this alert may appear while production - data is being copied to the environment. This is because production - traffic replay is paused during the copying job. -- It could be a genuine low number of requests. The threshold is - configurable in hieradata so we can tweak for environments where we - expect to see lower traffic levels. -- It could be indicative of a bigger problem. A previous cause for this has - been misconfigured firewall configs on our vSE load balancer. - -[nginx_logs]: /manual/kibana.html#nginx-logs diff --git a/source/manual/alerts/pingdom-homepage-check.html.md b/source/manual/alerts/pingdom-homepage-check.html.md index dfc797866c..1c515c1d4e 100644 --- a/source/manual/alerts/pingdom-homepage-check.html.md +++ b/source/manual/alerts/pingdom-homepage-check.html.md @@ -3,7 +3,7 @@ owner_slack: "#govuk-2ndline-tech" title: Pingdom check for homepage failing parent: "/manual.html" layout: manual_layout -section: Icinga alerts +section: Pagerduty alerts --- [Pingdom][] monitors externally (from ~10 locations in Europe and America) diff --git a/source/manual/alerts/pingdom-search-check.html.md b/source/manual/alerts/pingdom-search-check.html.md deleted file mode 100644 index 400c6caf34..0000000000 --- a/source/manual/alerts/pingdom-search-check.html.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -owner_slack: "#govuk-2ndline-tech" -title: Pingdom search check -parent: "/manual.html" -layout: manual_layout -section: Icinga alerts ---- - -If [Pingdom](/manual/pingdom.html) can't retrieve the search results page, it means that while GOV.UK -may be available, it is not possible to retrieve dynamic content. 
Assuming that -the homepage check has not failed, the CDN is probably OK. It is possible for -our main provider (AWS) to be down and for us to serve static content -from a secondary mirror at a second supplier (AWS/GCP). - -This is not as critical a problem as you might assume, because a large amount -of traffic from external searches goes directly to static content and can be -served from the mirror. Debug as normal by walking through the stack backwards -from the user's perspective to find out where the failure is. diff --git a/source/manual/alerts/publisher-scheduled-publishing.html.md b/source/manual/alerts/publisher-scheduled-publishing.html.md deleted file mode 100644 index 7c45fce3a2..0000000000 --- a/source/manual/alerts/publisher-scheduled-publishing.html.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -owner_slack: "#govuk-whitehall-experience-tech" -title: More items scheduled for publication than in queue for publisher -parent: "/manual.html" -layout: manual_layout -section: Icinga alerts ---- - -This alert means that the number of editions in the publisher database -which are scheduled to be published in the future is different from -the number currently in the Sidekiq queue. - -This can happen in Staging and Integration as a result of the data -sync from Production. Run Publisher's `editions:requeue_scheduled_for_publishing` -rake task to re-queue all scheduled editions in Integration and Staging. 
diff --git a/source/manual/alerts/search-api-app-healthcheck-not-ok.html.md b/source/manual/alerts/search-api-app-healthcheck-not-ok.html.md deleted file mode 100644 index 780ab1bbe4..0000000000 --- a/source/manual/alerts/search-api-app-healthcheck-not-ok.html.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -owner_slack: "#govuk-2ndline-tech" -title: Search API app healthcheck not ok -parent: "/manual.html" -layout: manual_layout -section: Icinga alerts ---- - -See also: [how healthcheck alerts work on GOV.UK](app-healthcheck-not-ok.html) - -## Elasticsearch connectivity is not OK - -The Search API uses elasticsearch as an underlying data store and search -engine. - -If the application cannot connect to the elasticsearch cluster, -this will prevent end users performing searches. Search API has [a custom healthcheck](https://github.com/alphagov/search-api/blob/05df032d2791769837d2b23cb8fd08a2bc474456/lib/rummager/app.rb#L311-L315) to check for this scenario. - -Note: We use a managed elasticsearch, [Amazon Elasticsearch Service][aws-elasticsearch], rather than running our own. - -### How do I investigate this? - -Find out why the Search API can't connect to elasticsearch. 
- -- Look at the Search API logs -- Look at the [elasticsearch cluster health][cluster-health] -- Check the status of the Elasticsearch cluster in the AWS console - -[cluster-health]: /manual/alerts/elasticsearch-cluster-health.html -[aws-elasticsearch]: https://aws.amazon.com/elasticsearch-service/ diff --git a/source/manual/alerts/search-api-learn-to-rank.html.md b/source/manual/alerts/search-api-learn-to-rank.html.md deleted file mode 100644 index 112882d1bd..0000000000 --- a/source/manual/alerts/search-api-learn-to-rank.html.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -owner_slack: "#govuk-2ndline-tech" -title: Train and deploy LTR model for Search API -parent: "/manual.html" -layout: manual_layout -section: Icinga alerts ---- - -[Search API](/repos/search-api.html) uses a machine learning tool called Learn to Rank (LTR) to improve search result relevance. This uses the TensorFlow Ranking module. - -On occassion the current ranking becomes out of date and the tool needs to be re-trained. There are [several rake tasks](/repos/search-api/learning-to-rank.html) to accomplish this. diff --git a/source/manual/alerts/search-api-queue-latency.html.md b/source/manual/alerts/search-api-queue-latency.html.md deleted file mode 100644 index b97591e866..0000000000 --- a/source/manual/alerts/search-api-queue-latency.html.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -owner_slack: "#govuk-2ndline-tech" -title: High Search API Sidekiq queue latency -parent: "/manual.html" -layout: manual_layout -section: Icinga alerts ---- - -[Search API](/repos/search-api.html) uses Sidekiq to offload indexing work. - -This alert triggers when there are jobs in the Sidekiq queue that are waiting -too long to be processed. This could result in documents being published not appearing in search results. - -## Investigating the issue - -The issue could be caused by a temporary spike in publishing activity, or -something being wrong with Search API. 
- -You can check the Sidekiq Grafana dashboard for Search API -([integration][search-api-grafana-integration], -[staging][search-api-grafana-staging], -[production][search-api-grafana-production]). Take a look at the "Retry set -size" - this could mean that jobs are failing. You can then look at -[Sentry][sentry] or [Sidekiq web][sidekiq-web] to see what's going on. - -[search-api-grafana-production]: https://grafana.production.govuk.digital/dashboard/file/sidekiq.json?refresh=1m&orgId=1&var-Application=search-api&var-Queues=All -[search-api-grafana-staging]: https://grafana.staging.govuk.digital/dashboard/file/sidekiq.json?refresh=1m&orgId=1&var-Application=search-api&var-Queues=All -[search-api-grafana-integration]: https://grafana.integration.govuk.digital/dashboard/file/sidekiq.json?refresh=1m&orgId=1&var-Application=search-api&var-Queues=All -[sentry]: /manual/error-reporting.html -[sidekiq-web]: /manual/sidekiq.html#sidekiq-web-aka-sidekiq-monitoring diff --git a/source/manual/alerts/search-reindex-failed.html.md.erb b/source/manual/alerts/search-reindex-failed.html.md.erb deleted file mode 100644 index 5d3ab93e9d..0000000000 --- a/source/manual/alerts/search-reindex-failed.html.md.erb +++ /dev/null @@ -1,23 +0,0 @@ ---- -owner_slack: "#govuk-2ndline-tech" -title: Search reindex failed -parent: "/manual.html" -layout: manual_layout -section: Icinga alerts ---- - -The reindex task is run weekly on a Monday at 9pm on integration. It -[reindexes][reindexing] every Elasticsearch index used by [search-api][]. This is -to ensure the process works as expected when we need to run it in production. -This task is manually run in production by the development team after they have -made and changes to the Elasticsearch schema. - -If this process fails then please escalate to the [team currently responsible for search][search-api] -for further investigation. 
- -This task can be run in Jenkins: - -<%= RunRakeTask.links("search-api", "search:migrate_schema SEARCH_INDEX=index_alias_name") %> - -[reindexing]: /manual/reindex-elasticsearch.html -[search-api]: /repos/search-api.html diff --git a/source/manual/alerts/search-reranker-healthcheck-not-ok.html.md b/source/manual/alerts/search-reranker-healthcheck-not-ok.html.md deleted file mode 100644 index f39ade20cd..0000000000 --- a/source/manual/alerts/search-reranker-healthcheck-not-ok.html.md +++ /dev/null @@ -1,29 +0,0 @@ ---- -owner_slack: "#govuk-2ndline-tech" -title: Search reranker healthcheck not ok -parent: "/manual.html" -layout: manual_layout -section: Icinga alerts ---- - -The Search API uses machine learning to rank search results based on -analytics data. If this alert fires, something has gone wrong with -that process and we're serving results as they were ordered by -elasticsearch. - -Unlike the other healthcheck failures, this does not mean that Search -API is serving errors. Only that it is serving potentially worse -results. - -The machine learning model is hosted in [Amazon SageMaker][aws-sagemaker]. - -### How do I investigate this? - -Find out why the Search API can't connect to SageMaker. 
- -- Look at the error message in the healthcheck response -- Look at the Search API logs -- Check the status of the [SageMaker endpoint in the AWS console][sagemaker-endpoint] - -[aws-sagemaker]: https://aws.amazon.com/sagemaker/ -[sagemaker-endpoint]: https://eu-west-1.console.aws.amazon.com/sagemaker/home?region=eu-west-1#/endpoints/govuk-production-search-ltr-endpoint diff --git a/source/manual/alerts/content-data-api-app-healthcheck-not-ok.html.md.erb b/source/manual/content-data-api-app-healthcheck-not-ok.html.md.erb similarity index 98% rename from source/manual/alerts/content-data-api-app-healthcheck-not-ok.html.md.erb rename to source/manual/content-data-api-app-healthcheck-not-ok.html.md.erb index d18d46ddce..00c9a8442c 100644 --- a/source/manual/alerts/content-data-api-app-healthcheck-not-ok.html.md.erb +++ b/source/manual/content-data-api-app-healthcheck-not-ok.html.md.erb @@ -1,13 +1,11 @@ --- owner_slack: "#govuk-platform-security-reliability-team" title: content-data-api app healthcheck not ok -section: Icinga alerts +section: Content Data layout: manual_layout parent: "/manual.html" --- -See also: [how healthcheck alerts work on GOV.UK](app-healthcheck-not-ok.html) - If there is a health check error showing for Content Data API, click on the alert to find out more details about what’s wrong. diff --git a/source/manual/content-data-architecture.html.md b/source/manual/content-data-architecture.html.md index 662137ee1b..344316d953 100644 --- a/source/manual/content-data-architecture.html.md +++ b/source/manual/content-data-architecture.html.md @@ -97,7 +97,7 @@ Content Data API runs an [ETL (Extract, Transform, Load)](https://en.wikipedia.o ![Overview of the ETL Processor](/images/content-data-architecture-etl-processor.png) -For more information see the [What is the ETL process](/manual/alerts/content-data-api-app-healthcheck-not-ok.html#what-is-the-etl-process) developer doc. 
+For more information see the [What is the ETL process](/manual/content-data-api-app-healthcheck-not-ok.html#what-is-the-etl-process) developer doc. #### Streams Processor diff --git a/source/manual/alerts/elasticsearch-cluster-health.html.md b/source/manual/elasticsearch-cluster-health.html.md similarity index 81% rename from source/manual/alerts/elasticsearch-cluster-health.html.md rename to source/manual/elasticsearch-cluster-health.html.md index 148a20ce7e..9a4fae2b10 100644 --- a/source/manual/alerts/elasticsearch-cluster-health.html.md +++ b/source/manual/elasticsearch-cluster-health.html.md @@ -3,7 +3,7 @@ owner_slack: "#govuk-2ndline-tech" title: Elasticsearch cluster health parent: "/manual.html" layout: manual_layout -section: Icinga alerts +section: Infrastructure --- Elasticsearch reports cluster health as one of three possible states, based on @@ -26,13 +26,6 @@ can be found in the Elasticsearch documentation. Make sure you understand the consequences of the problem before jumping to a solution. -Icinga uses the `check_elasticsearch_aws` check from [nagios-plugins][] to -monitor the health of the AWS managed Elasticsearch cluster. This plugin uses -various endpoints of the Elasticsearch API, but also extrapolates additional -information to help you diagnose any problems. - -[nagios-plugins]: https://github.com/alphagov/nagios-plugins/ - ### Investigating problems #### View a live dashboard @@ -69,17 +62,7 @@ Response JSON from the `/_cluster/health` endpoint looks like: ``` A tunnel to Elasticsearch in a specific environment (e.g staging) can be created -using the following: - -``` -gds govuk connect ssh --environment staging search -- -N -L 9200:elasticsearch6:80 -``` - -Elasticsearch will then be available at . - -#### Logging - -Access to logs is detailed in the [logging documentation](/manual/logging.html#elasticsearch). 
+in a similar manner to [accessing an OpenSearch Dashboard](/manual/manage-opensearch-on-aws.html). ### Fixing issues with the cluster diff --git a/source/manual/help-with-publishing-content.html.md b/source/manual/help-with-publishing-content.html.md index 2a9937db1f..3515f58a64 100644 --- a/source/manual/help-with-publishing-content.html.md +++ b/source/manual/help-with-publishing-content.html.md @@ -39,7 +39,7 @@ necessary to help them to ensure it goes out as smoothly as possible. Safety Alerts go out correctly. Although an alert won't be triggered for other kinds of documents, the [guidance will still apply](alerts/email-alerts-travel-medical.html). -- If [a scheduled publication hasn't gone live](alerts/whitehall-scheduled-publishing.html), +- If [a scheduled publication hasn't gone live](whitehall-scheduled-publishing.html), start here: [if documents aren't live after being published][live]. If it looks as though the content was never published from Whitehall, there is a Rake task available which will publish overdue diff --git a/source/manual/on-call.html.md b/source/manual/on-call.html.md index 902832d5ee..bc3dc3dc3a 100644 --- a/source/manual/on-call.html.md +++ b/source/manual/on-call.html.md @@ -60,8 +60,8 @@ push notification).
There are 2 ways that this might contact you: Any Icinga checks that use `govuk_urgent_priority` will cause PagerDuty to be notified: - [Travel advice emails not going out](/manual/alerts/email-alerts-travel-medical.html) -- [Overdue publications in Whitehall](/manual/alerts/whitehall-scheduled-publishing.html#overdue-publications-in-whitehall) -- [Scheduled publications in Whitehall not queued](/manual/alerts/whitehall-scheduled-publishing.html#scheduled-publications-in-whitehall-not-queued) +- [Overdue publications in Whitehall](/manual/whitehall-scheduled-publishing.html#overdue-publications-in-whitehall) +- [Scheduled publications in Whitehall not queued](/manual/whitehall-scheduled-publishing.html#scheduled-publications-in-whitehall-not-queued) - [High nginx 5xx rate for www-origin on cache machines](/manual/alerts/high-nginx-5xx-rate.html) You can get the most up to date list of these by searching the Puppet repo for [govuk_urgent_priority](https://github.com/alphagov/govuk-puppet/search?q=govuk_urgent_priority). diff --git a/source/manual/alerts/publisher-unprocessed-fact-check-emails.html.md b/source/manual/publisher-unprocessed-fact-check-emails.html.md similarity index 90% rename from source/manual/alerts/publisher-unprocessed-fact-check-emails.html.md rename to source/manual/publisher-unprocessed-fact-check-emails.html.md index bec6d4c511..7eb7fda343 100644 --- a/source/manual/alerts/publisher-unprocessed-fact-check-emails.html.md +++ b/source/manual/publisher-unprocessed-fact-check-emails.html.md @@ -1,20 +1,19 @@ --- owner_slack: "#govuk-whitehall-experience-tech" title: "Publisher: Unprocessed fact-check emails" -section: Icinga alerts -subsection: Email alerts +section: Publishing layout: manual_layout parent: "/manual.html" --- -As part of the [Publisher fact checking process], this alert appears if emails +We send emails as part of the [Publisher fact checking process]. 
Sometimes emails may have arrived in the inbox but weren't able to be processed. This is usually because they're missing the identification for the edition they relate to (which is currently stored in the subject line). [Publisher fact checking process]: https://github.com/alphagov/publisher/blob/main/docs/fact-checking.md -## Dealing with the alert +## Dealing with the issue ### Log in to the inbox @@ -31,7 +30,7 @@ The current email addresses used for the fact checking process are: ### Retrieve the mailbox credentials from the app ```sh -echo 'Publisher::Application.mail_fetcher_config' | k exec -i deploy/publisher -- rails c +k exec deploy/publisher -- rails runner 'pp Publisher::Application.mail_fetcher_config' ``` ### Investigate the unprocessed emails diff --git a/source/manual/alerts/whitehall-scheduled-publishing.html.md.erb b/source/manual/whitehall-scheduled-publishing.html.md.erb similarity index 81% rename from source/manual/alerts/whitehall-scheduled-publishing.html.md.erb rename to source/manual/whitehall-scheduled-publishing.html.md.erb index 47e6e618c2..436f023e00 100644 --- a/source/manual/alerts/whitehall-scheduled-publishing.html.md.erb +++ b/source/manual/whitehall-scheduled-publishing.html.md.erb @@ -3,12 +3,12 @@ owner_slack: "#govuk-whitehall-experience-tech" title: Whitehall scheduled publishing parent: "/manual.html" layout: manual_layout -section: Icinga alerts +section: Publishing --- ## Overdue publications in Whitehall -This alert means that there are scheduled editions which have passed their +Sometimes there may be scheduled editions which have passed their publication due date but haven't been published by the scheduled publishing workers. Scheduled publishing is performed by Sidekiq workers picking up jobs from a scheduled queue. @@ -37,11 +37,11 @@ following: If the above rake tasks aren't working, it could be because the database was recently restored, perhaps due to the data sync. 
In that case, you can try -running the following Rake task on a `whitehall_backend` machine: +running the following Rake task: <%= RunRakeTask.links("whitehall-admin", "publishing:scheduled:requeue_all_jobs") %> -Due to the overnight [data anonymisation process](https://github.com/alphagov/whitehall/blob/7b5c5a086b89cb62ffba62b152a0a8dcfc10c8e6/script/scrub-database) you may notice +Due to the overnight [data anonymisation process](https://github.com/alphagov/govuk-helm-charts/blob/20a96dd5/charts/db-backup/scripts/whitehall.sql) you may notice that some of the pending documents have one or more edition that is in a `scheduled` state, is `access_limited`, and may have one or more attachments with the filename `redacted.pdf`. @@ -54,8 +54,8 @@ the same documents/editions. ## Scheduled publications in Whitehall not queued -This alert means that the number of editions in the database which are -scheduled to be published in the future is different from the number currently +It is also possible for the number of editions in the database which are +scheduled to be published in the future to be different from the number currently in the queue. Run the `publishing:scheduled:requeue_all_jobs` Rake task to requeue all