From 37865c7f06bf9af90a3b9c2aca894fe9e60f9fb9 Mon Sep 17 00:00:00 2001 From: Ike Saunders Date: Tue, 12 Nov 2024 17:25:34 -0500 Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=89=20add=20cloudflare=20images=20sync?= =?UTF-8?q?=20script?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 4 + .../cloudflareImagesSync.ts | 458 ++++++++++++++++++ .../cloudflareImagesSync/invalidImages.json | 105 ++++ devTools/cloudflareImagesSync/tsconfig.json | 15 + package.json | 5 +- settings/serverSettings.ts | 6 + yarn.lock | 31 ++ 7 files changed, 623 insertions(+), 1 deletion(-) create mode 100644 devTools/cloudflareImagesSync/cloudflareImagesSync.ts create mode 100644 devTools/cloudflareImagesSync/invalidImages.json create mode 100644 devTools/cloudflareImagesSync/tsconfig.json diff --git a/Makefile b/Makefile index faa6f072e7..67ab147bbf 100644 --- a/Makefile +++ b/Makefile @@ -154,6 +154,10 @@ sync-images: sync-images.preflight-check @echo '==> Syncing images to R2' @. ./.env && ./devTools/docker/sync-s3-images.sh +sync-cloudflare-images: + @echo '==> Syncing images to Cloudflare' + @yarn syncCloudflareImages + refresh.full: refresh refresh.pageviews sync-images @echo '==> Full refresh completed' @make bake-images diff --git a/devTools/cloudflareImagesSync/cloudflareImagesSync.ts b/devTools/cloudflareImagesSync/cloudflareImagesSync.ts new file mode 100644 index 0000000000..be8408f46c --- /dev/null +++ b/devTools/cloudflareImagesSync/cloudflareImagesSync.ts @@ -0,0 +1,458 @@ +const is = require("image-size") +import * as readline from "readline" +import pMap from "p-map" +import path from "path" +import fs from "fs/promises" +import { DbEnrichedImage } from "@ourworldindata/types" +import * as db from "../../db/db.js" +import { + CLOUDFLARE_IMAGES_ACCOUNT_ID, + CLOUDFLARE_IMAGES_API_KEY, + IMAGE_HOSTING_R2_CDN_URL, +} from "../../settings/serverSettings.js" +import { excludeNullish, groupBy, keyBy } from "@ourworldindata/utils" + +type CloudflareImageDirectory = Record + +enum InvalidImageReason { + TooLarge = "TooLarge", + InvalidFormat = "InvalidFormat", + InvalidDimensions = "InvalidDimensions", + TooManyMegapixels = "TooManyMegapixels", + InvalidMetadata = "InvalidMetadata", + UnknownError = "UnknownError", +} + +type ImageValidationObject = { + filename: string + reason: InvalidImageReason + extra?: any +} + +function stringifyImageMetadata(image: DbEnrichedImage) { + return JSON.stringify({ + filename: image.filename, + }) +} + +/** + * Make sure that each database cloudflareId corresponds to a valid image in the Cloudflare Images directory + */ +async function validateDirectory( + trx: db.KnexReadWriteTransaction, + directory: CloudflareImageDirectory +): Promise<{ isValid: boolean; invalidImages: string[] }> { + const imagesWithIds = await db.knexRaw<{ + filename: string + cloudflareId: string + }>( + trx, + `-- sql + SELECT filename, cloudflareId FROM images WHERE cloudflareId IS NOT NULL` + ) + const imagesSharingCloudflareIds = await db + .knexRaw<{ + cloudflareId: string + count: number + filenames: string + }>( + trx, + `-- sql + SELECT + cloudflareId, + COUNT(*) as count, + JSON_ARRAYAGG( + filename + ) as filenames + FROM images + WHERE cloudflareId IS NOT NULL + GROUP BY cloudflareId + HAVING count > 1` + ) + .then((results) => + results.map((result) => ({ + cloudflareId: result.cloudflareId, + count: result.count, + filenames: JSON.parse(result.filenames) as string[], + })) + ) + .then((results) => keyBy(results, "cloudflareId")) + + const invalidImages: string[] = [] + for (const image of imagesWithIds) { + if (!directory[image.filename]) { + // If an identical image was uploaded with multiple filenames, subsequent copies will use the same cloudflareId as the first + // so let's check if this is a case of that + const imagesSharingCloudflareId = + imagesSharingCloudflareIds[image.cloudflareId] + if (imagesSharingCloudflareId) { + const filenames = imagesSharingCloudflareId.filenames + if (filenames.includes(image.filename)) { + console.log( + `Image with filename "${image.filename}" has a cloudflareId that is shared with other images.` + ) + continue + } + } + console.log( + `Image with filename "${image.filename}" has a cloudflareId that is not in the Cloudflare Images directory.` + ) + invalidImages.push(image.filename) + } + } + return { + isValid: invalidImages.length === 0, + invalidImages, + } +} + +async function purgeRecords(trx: db.KnexReadWriteTransaction) { + await new Promise((resolve) => { + const readlineInterface = readline.createInterface({ + input: process.stdin, + output: process.stdout, + }) + + readlineInterface.question( + "Are you sure you want to delete ALL images from Cloudflare Images? (y/n) ", + (answer) => { + if (answer.toLowerCase() === "y") { + resolve() + } else { + console.log("Aborting.") + process.exit(0) + } + readlineInterface.close() + } + ) + }) + + const directory = await getCloudflareImageDirectory() + console.log("Deleting all images from Cloudflare Images...") + await pMap( + Object.values(directory), + async (image) => { + console.log("Deleting image:", image.filename) + try { + await fetch( + `https://api.cloudflare.com/client/v4/accounts/${CLOUDFLARE_IMAGES_ACCOUNT_ID}/images/v1/${image.id}`, + { + method: "DELETE", + headers: { + Authorization: `Bearer ${CLOUDFLARE_IMAGES_API_KEY}`, + }, + } + ) + } catch (e) { + console.error(e) + } + }, + { concurrency: 10 } + ) + console.log("Finished") + + await new Promise((resolve) => { + const readlineInterface = readline.createInterface({ + input: process.stdin, + output: process.stdout, + }) + + readlineInterface.question( + "Would you also like to set all cloudflareIds to NULL in the DB? (y/n) ", + (answer) => { + if (answer.toLowerCase() === "y") { + resolve() + } else { + console.log("Aborting.") + process.exit(0) + } + readlineInterface.close() + } + ) + }) + console.log("May God have mercy on your soul.") + + await db.knexRaw( + trx, + `-- sql + UPDATE images + SET cloudflareId = NULL` + ) + console.log("All cloudflareIds set to NULL in the DB.") +} + +/** + * Cloudflare has a width/height of 12000px, metadata of 1024B, 100megapixels, and a 10MB filesize limit + */ +function validateImage( + imageBuffer: Buffer, + metadata: string +): InvalidImageReason | null { + const imageSize = is(imageBuffer) + if (!imageSize) { + return InvalidImageReason.InvalidFormat + } + + if (imageSize.width > 12000 || imageSize.height > 12000) { + return InvalidImageReason.InvalidDimensions + } + + if (imageSize.width * imageSize.height > 100 * 1000000) { + return InvalidImageReason.TooManyMegapixels + } + + if (imageBuffer.byteLength > 10 * 1024 * 1024) { + return InvalidImageReason.TooLarge + } + + if (Buffer.byteLength(metadata, "utf8") > 1024) { + return InvalidImageReason.InvalidMetadata + } + + return null +} + +async function checkIfAlreadyUploadedToCloudflareImages( + filename: string, + cloudflareImagesDirectory: CloudflareImageDirectory +): Promise { + if (cloudflareImagesDirectory[filename]) { + console.log( + `Image with filename "${filename}" has already uploaded to Cloudflare Images.` + ) + return true + } + return false +} + +async function checkIfAlreadyTrackedInDB( + trx: db.KnexReadWriteTransaction, + filename: string +) { + console.log("Checking to see if the DB has the Cloudflare ID...") + const cloudflareId = await trx + .raw<{ cloudflareId: string }[][]>( + `-- sql + SELECT cloudflareId FROM images WHERE filename = ? + `, + [filename] + ) + .then((res) => res[0][0]?.cloudflareId) + if (!cloudflareId) { + console.log("No Cloudflare ID found in the DB.") + return false + } else { + console.log(`Cloudflare ID "${cloudflareId}" exists in the DB.`) + return true + } +} + +async function updateDbWithCloudflareId( + trx: db.KnexReadWriteTransaction, + filename: string, + cloudflareId: string +) { + console.log("Updating the DB with the Cloudflare ID...") + await trx.raw( + `-- sql + UPDATE images + SET cloudflareId = ? + WHERE filename = ?`, + [cloudflareId, filename] + ) +} + +async function uploadImageToCloudflareImages( + trx: db.KnexReadWriteTransaction, + image: DbEnrichedImage, + invalidImages: ImageValidationObject[], + cloudflareImagesDirectory: CloudflareImageDirectory +) { + const filename = image.filename + + /** + * If the image is already tracked in the DB, we don't need to do anything. + * If the image is already uploaded to Cloudflare Images, we check if we need to update the DB with the cloudflareId. + * It's possible the image has already been uploaded but is saved under a different filename, + * in which case we go through the normal process of uploading the image, + * which is a no-op for Cloudflare, but will give us the right ID to update the DB with. + */ + const alreadyTracked = await checkIfAlreadyTrackedInDB(trx, filename) + const alreadyUploaded = await checkIfAlreadyUploadedToCloudflareImages( + filename, + cloudflareImagesDirectory + ) + if (alreadyTracked) { + return + } + if (alreadyUploaded) { + const cloudflareId = cloudflareImagesDirectory[filename].id + await updateDbWithCloudflareId(trx, filename, cloudflareId) + return + } + + const imageUrl = `${IMAGE_HOSTING_R2_CDN_URL}/production/${filename}` + console.log("Downloading image:", filename) + const imageBuffer = await fetch(imageUrl).then((res) => res.arrayBuffer()) + const metadata = stringifyImageMetadata(image) + const isInvalid = validateImage(Buffer.from(imageBuffer), metadata) + if (isInvalid) { + console.log(`Image "${filename}" is invalid: ${isInvalid}`) + invalidImages.push({ + filename, + reason: isInvalid, + }) + return + } + + const formData = new FormData() + formData.append("url", imageUrl) + formData.append("metadata", metadata) + formData.append("requireSignedURLs", "false") + + console.log("Uploading image to Cloudflare Images...") + const uploadResults = await fetch( + `https://api.cloudflare.com/client/v4/accounts/${CLOUDFLARE_IMAGES_ACCOUNT_ID}/images/v1`, + { + method: "POST", + headers: { + Authorization: `Bearer ${CLOUDFLARE_IMAGES_API_KEY}`, + }, + body: formData, + } + ).then((res) => res.json()) + + if (!uploadResults || uploadResults.errors.length) { + invalidImages.push({ + filename, + reason: InvalidImageReason.UnknownError, + extra: uploadResults.errors, + }) + return + } + + await trx.raw( + `-- sql + UPDATE images + SET cloudflareId = ? + WHERE googleId = ?`, + [uploadResults.result.id, image.googleId] + ) +} + +async function getCloudflareImageDirectory() { + console.log("Fetching Cloudflare Images directory...") + const directory = await fetch( + `https://api.cloudflare.com/client/v4/accounts/${CLOUDFLARE_IMAGES_ACCOUNT_ID}/images/v1?per_page=2000`, + { + headers: { + Authorization: `Bearer ${CLOUDFLARE_IMAGES_API_KEY}`, + }, + } + ) + .then((res) => res.json()) + .then((res) => { + console.log( + `Cloudflare Images directory fetched. ${res.result.images.length} images found.` + ) + return res.result.images + }) + .then( + (images) => + keyBy(images, (image) => + decodeURIComponent(image.filename) + ) as CloudflareImageDirectory + ) + + return directory +} + +async function fetchImagesFromDatabase(trx: db.KnexReadWriteTransaction) { + console.log("Fetching images from the database...") + return await trx + .raw( + `-- sql + SELECT * FROM images WHERE id IN ( + SELECT DISTINCT imageId FROM posts_gdocs_x_images + )` + ) + .then((res) => res.flat()) + .then(excludeNullish) + .then((images) => images.filter((image) => image && image.filename)) + .then((images) => + images.sort((a, b) => a.filename.localeCompare(b.filename)) + ) +} + +async function uploadImagesToCloudflareImages( + trx: db.KnexReadWriteTransaction, + cloudflareImagesDirectory: CloudflareImageDirectory +) { + const invalidImages: ImageValidationObject[] = [] + + const images = await fetchImagesFromDatabase(trx) + console.log(`${images.length} images fetched.`) + + await pMap( + images, + async (image) => { + console.log(`Processing image: ${image.filename}`) + try { + await uploadImageToCloudflareImages( + trx, + image, + invalidImages, + cloudflareImagesDirectory + ) + } catch (e) { + console.error(e) + invalidImages.push({ + filename: image.filename, + reason: InvalidImageReason.UnknownError, + extra: e, + }) + } + }, + { concurrency: 10 } + ) + + console.log("Finished!") + console.log( + `There were ${invalidImages.length} invalid images. See invalidImages.json for details.` + ) + + await fs.writeFile( + path.join(__dirname, "invalidImages.json"), + JSON.stringify(invalidImages, null, 2) + ) +} + +async function main() { + if (!CLOUDFLARE_IMAGES_ACCOUNT_ID || !CLOUDFLARE_IMAGES_API_KEY) { + console.error( + `Cloudflare Images credentials not set. +You need to set "CLOUDFLARE_IMAGES_ACCOUNT_ID" and "CLOUDFLARE_IMAGES_API_KEY" in your .env` + ) + return + } + + await db.knexReadWriteTransaction(async (trx) => { + // await purgeRecords(trx) + + const directory = await getCloudflareImageDirectory() + const { isValid, invalidImages } = await validateDirectory( + trx, + directory + ) + if (isValid) { + await uploadImagesToCloudflareImages(trx, directory) + } else { + console.error( + `The DB has images that do not exist in the Cloudflare Images directory. You should check those out first` + ) + console.error(invalidImages) + } + }) +} + +main().then(() => process.exit(0)) diff --git a/devTools/cloudflareImagesSync/invalidImages.json b/devTools/cloudflareImagesSync/invalidImages.json new file mode 100644 index 0000000000..a14f2b4c80 --- /dev/null +++ b/devTools/cloudflareImagesSync/invalidImages.json @@ -0,0 +1,105 @@ +[ + { + "filename": "2019-Revision-–-World-Population-Growth-1700-2100.png", + "reason": "InvalidDimensions" + }, + { + "filename": "Annual-World-Population-since-10-thousand-BCE-1.png", + "reason": "UnknownError", + "extra": {} + }, + { + "filename": "By-continent-and-decade-01.png", + "reason": "InvalidDimensions" + }, + { + "filename": "cable-tv-access-and-preference-for-a-son.png", + "reason": "InvalidDimensions" + }, + { + "filename": "calculating-risk-ratios-differences-odds.png", + "reason": "InvalidDimensions" + }, + { + "filename": "cardiovascular-diseases-types.png", + "reason": "InvalidDimensions" + }, + { + "filename": "causes-of-death-2019-full.png", + "reason": "InvalidDimensions" + }, + { + "filename": "Decade-in-which-smallpox-ceased-to-be-endemic-map.png", + "reason": "InvalidDimensions" + }, + { + "filename": "Energy-Units-01.png", + "reason": "InvalidDimensions" + }, + { + "filename": "England-death-rates.png", + "reason": "InvalidDimensions" + }, + { + "filename": "Famine-victims-since-1860s_March18.png", + "reason": "InvalidDimensions" + }, + { + "filename": "FEATURED-IMAGE-World-Population-Growth.png", + "reason": "InvalidDimensions" + }, + { + "filename": "Female-to-male-wage-ratio-01.png", + "reason": "InvalidDimensions" + }, + { + "filename": "future-yields-climate-distribution.png", + "reason": "InvalidDimensions" + }, + { + "filename": "Gender-Pay-Gap-01.png", + "reason": "InvalidDimensions" + }, + { + "filename": "Global-land-use-breakdown.png", + "reason": "InvalidDimensions" + }, + { + "filename": "Global-land-use-graphic.png", + "reason": "InvalidDimensions" + }, + { + "filename": "height-distribution.png", + "reason": "InvalidDimensions" + }, + { + "filename": "Multiple-risk-factors-full-1.png", + "reason": "UnknownError", + "extra": {} + }, + { + "filename": "Norway-death-rates.png", + "reason": "InvalidDimensions" + }, + { + "filename": "not-reading-with-comprehension.png", + "reason": "InvalidDimensions" + }, + { + "filename": "Pandemics-Timeline-Death-Tolls-OWID.png", + "reason": "InvalidDimensions" + }, + { + "filename": "period-vs-cohort-explanation.png", + "reason": "UnknownError", + "extra": {} + }, + { + "filename": "record-female-life-expectancy-since-1840-updated-2023.png", + "reason": "InvalidDimensions" + }, + { + "filename": "the-history-of-global-inequality-featured-image.png", + "reason": "InvalidDimensions" + } +] diff --git a/devTools/cloudflareImagesSync/tsconfig.json b/devTools/cloudflareImagesSync/tsconfig.json new file mode 100644 index 0000000000..208a03820d --- /dev/null +++ b/devTools/cloudflareImagesSync/tsconfig.json @@ -0,0 +1,15 @@ +{ + "extends": "../tsconfigs/tsconfig.base.json", + "compilerOptions": { + "outDir": "../../itsJustJavascript/devTools/cloudflareImagesSync", + "rootDir": "." + }, + "references": [ + { + "path": "../../db" + }, + { + "path": "../../settings" + } + ] +} diff --git a/package.json b/package.json index 59a4905380..dddaa740f2 100644 --- a/package.json +++ b/package.json @@ -41,7 +41,8 @@ "testJest": "lerna run buildTests && jest", "testSiteNavigation": "tsx --tsconfig tsconfig.tsx.json devTools/navigationTest/navigationTest.ts", "generateDbTypes": "npx @rmp135/sql-ts -c db/sql-ts/sql-ts-config.json", - "syncGraphersToR2": "tsx --tsconfig tsconfig.tsx.json devTools/syncGraphersToR2/syncGraphersToR2.ts" + "syncGraphersToR2": "tsx --tsconfig tsconfig.tsx.json devTools/syncGraphersToR2/syncGraphersToR2.ts", + "syncCloudflareImages": "tsx --tsconfig tsconfig.tsx.json devTools/cloudflareImagesSync/cloudflareImagesSync.ts" }, "dependencies": { "@algolia/autocomplete-js": "^1.17.2", @@ -189,6 +190,7 @@ "@types/fs-extra": "^11.0.1", "@types/geojson": "^7946.0.10", "@types/html-to-text": "^9.0.4", + "@types/image-size": "^0.8.0", "@types/indefinite": "^2.3.2", "@types/ini": "^4", "@types/js-cookie": "^3.0.2", @@ -232,6 +234,7 @@ "flag-icons": "^7.2.3", "http-server": "^14.1.1", "husky": "^9.0.11", + "image-size": "^1.1.1", "jest": "^29.7.0", "jest-environment-jsdom": "^29.7.0", "lerna": "^8.1.6", diff --git a/settings/serverSettings.ts b/settings/serverSettings.ts index 2530259bba..51959210ad 100644 --- a/settings/serverSettings.ts +++ b/settings/serverSettings.ts @@ -169,6 +169,12 @@ export const R2_SECRET_ACCESS_KEY: string = export const R2_REGION: string = serverSettings.R2_REGION || rcloneConfig["owid-r2"]?.region || "auto" +export const CLOUDFLARE_IMAGES_ACCOUNT_ID: string = + serverSettings.CLOUDFLARE_IMAGES_ACCOUNT_ID || "" + +export const CLOUDFLARE_IMAGES_API_KEY: string = + serverSettings.CLOUDFLARE_IMAGES_API_KEY || "" + export const GRAPHER_CONFIG_R2_BUCKET: string | undefined = serverSettings.GRAPHER_CONFIG_R2_BUCKET export const GRAPHER_CONFIG_R2_BUCKET_PATH: string | undefined = diff --git a/yarn.lock b/yarn.lock index e55dde5a4f..26847a8e4c 100644 --- a/yarn.lock +++ b/yarn.lock @@ -5270,6 +5270,15 @@ __metadata: languageName: node linkType: hard +"@types/image-size@npm:^0.8.0": + version: 0.8.0 + resolution: "@types/image-size@npm:0.8.0" + dependencies: + image-size: "npm:*" + checksum: 10/9530adc7515609f801d37d4db80f883855d7e5ba6c593f3705b6d54550438a97822937d416bc27e09237b2c610e692cce3cf59ff3105d9c4bb8c91c13ba269b6 + languageName: node + linkType: hard + "@types/indefinite@npm:^2.3.2": version: 2.3.2 resolution: "@types/indefinite@npm:2.3.2" @@ -11082,6 +11091,7 @@ __metadata: "@types/fs-extra": "npm:^11.0.1" "@types/geojson": "npm:^7946.0.10" "@types/html-to-text": "npm:^9.0.4" + "@types/image-size": "npm:^0.8.0" "@types/indefinite": "npm:^2.3.2" "@types/ini": "npm:^4" "@types/js-cookie": "npm:^3.0.2" @@ -11156,6 +11166,7 @@ __metadata: html-to-text: "npm:^9.0.5" http-server: "npm:^14.1.1" husky: "npm:^9.0.11" + image-size: "npm:^1.1.1" indefinite: "npm:^2.4.3" ini: "npm:^4.1.2" instantsearch.js: "npm:^4.72.1" @@ -11737,6 +11748,17 @@ __metadata: languageName: node linkType: hard +"image-size@npm:*, image-size@npm:^1.1.1": + version: 1.1.1 + resolution: "image-size@npm:1.1.1" + dependencies: + queue: "npm:6.0.2" + bin: + image-size: bin/image-size.js + checksum: 10/f28966dd3f6d4feccc4028400bb7e8047c28b073ab0aa90c7c53039288139dd416c6bc254a976d4bf61113d4bc84871786804113099701cbfe9ccf377effdb54 + languageName: node + linkType: hard + "immutable@npm:^4.0.0, immutable@npm:^4.3.6": version: 4.3.6 resolution: "immutable@npm:4.3.6" @@ -16327,6 +16349,15 @@ __metadata: languageName: node linkType: hard +"queue@npm:6.0.2": + version: 6.0.2 + resolution: "queue@npm:6.0.2" + dependencies: + inherits: "npm:~2.0.3" + checksum: 10/3437954ef1442c86ff01a0fbe3dc6222838823b1ca97f37eff651bc20b868c0c2904424ef2c0d44cba46055f54b578f92866e573125dc9a5e8823d751e4d1585 + languageName: node + linkType: hard + "quick-lru@npm:^4.0.1": version: 4.0.1 resolution: "quick-lru@npm:4.0.1"