Skip to content

Commit

Permalink
Merge pull request #2422 from lmcnulty/add-handling-for-washington-post-parsing
Browse files Browse the repository at this point in the history

Fetch articles from sites using cookies
  • Loading branch information
kepae authored Nov 28, 2023
2 parents 035af83 + 28f4952 commit 7e6e8aa
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 1 deletion.
40 changes: 40 additions & 0 deletions site/gatsby-site/cypress/e2e/integration/submit.cy.js
Original file line number Diff line number Diff line change
Expand Up @@ -1798,4 +1798,44 @@ describe('The Submit form', () => {

cy.contains('Please review. Some data is missing.').should('exist');
});

it('Should fetch article', () => {
  // Article whose metadata the form should be able to pull in.
  const articleUrl =
    'https://www.arstechnica.com/gadgets/2017/11/youtube-to-crack-down-on-inappropriate-content-masked-as-kids-cartoons/';

  const verifyMessage = 'Please verify all information programmatically pulled from the report';

  cy.visit(url);

  // Alias the parser request so the test can block until it has completed.
  cy.intercept('GET', parserURL).as('parseNews');

  cy.get('input[name="url"]').type(articleUrl);

  cy.get('button').contains('Fetch info').click();

  cy.wait('@parseNews');

  // The success toast must be shown...
  cy.get('.tw-toast').contains(verifyMessage).should('exist');

  // ...and the failure toast must not.
  cy.get('.tw-toast').contains('Error fetching news.').should('not.exist');
});

it('Should fetch article from site using cookies as fallback', () => {
  // Per the test name, this site is the one that exercises the cookie fallback.
  const articleUrl =
    'https://www.washingtonpost.com/technology/2023/02/16/microsoft-bing-ai-chatbot-sydney/';

  const verifyMessage = 'Please verify all information programmatically pulled from the report';

  cy.visit(url);

  // Alias the parser request so the test can block until it has completed.
  cy.intercept('GET', parserURL).as('parseNews');

  cy.get('input[name="url"]').type(articleUrl);

  cy.get('button').contains('Fetch info').click();

  cy.wait('@parseNews');

  // The success toast must be shown...
  cy.get('.tw-toast').contains(verifyMessage).should('exist');

  // ...and the failure toast must not.
  cy.get('.tw-toast').contains('Error fetching news.').should('not.exist');
});
});
53 changes: 52 additions & 1 deletion site/gatsby-site/src/api/parseNews.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import Parser from '@postlight/parser';
import { format, parseISO } from 'date-fns';
import axios from 'axios';

const stripImages = /!\[[^\]]*\]\((?<filename>.*?)(?="|\))(?<optionalpart>".*")?\)/g;

export default async function handler(req, res) {
const { url } = req.query;

const article = await Parser.parse(url, { contentType: 'markdown' });
const article = await getArticle(url, { cookies: false });

const response = {
title: article.title,
Expand All @@ -21,3 +22,53 @@ export default async function handler(req, res) {

res.status(200).json(response);
}

// Parses the article at `url` into markdown.
//
// Callers start with { cookies: false }, which lets the parser fetch the
// page itself. If that attempt throws, the function calls itself once more
// with { cookies: true }; on that pass the HTML is downloaded up front via
// getHtmlWithCookies and handed to the parser. An error raised during the
// cookie pass is rethrown to the caller.
const getArticle = async (url, config) => {
  const options = { contentType: 'markdown' };

  try {
    if (config.cookies) {
      options.html = await getHtmlWithCookies(url);
    }

    return await Parser.parse(url, options);
  } catch (err) {
    // First (cookie-less) attempt failed: retry once with cookies.
    if (!config.cookies) {
      return getArticle(url, { cookies: true });
    }

    // The fallback failed as well; nothing left to try.
    throw err;
  }
};

// Downloads the HTML at `url`, following 301/302 redirects by hand so that
// cookies set by the redirecting response are sent along with the follow-up
// request.
//
// Resolves with the response body (`response.data`) of the final request.
// Rejects with the underlying axios error for any failure other than a
// 301/302 redirect.
const getHtmlWithCookies = async (url) => {
  const axiosInstance = axios.create();

  // Automatic redirects are turned off; redirect responses are instead
  // handled in the error interceptor below, where their headers are visible.
  axiosInstance.defaults.maxRedirects = 0;
  axiosInstance.defaults.withCredentials = true;
  // NOTE(review): `credentials` looks like a fetch option rather than an
  // axios one; kept unchanged -- confirm whether it has any effect.
  axiosInstance.defaults.credentials = 'same-origin';

  axiosInstance.interceptors.response.use(
    (response) => response,
    (error) => {
      if (error.response && [301, 302].includes(error.response.status)) {
        const redirectUrl = error.response.headers.location;

        // A redirect is not guaranteed to set cookies; treat a missing
        // header as "no cookies" instead of throwing on `.map`.
        const setCookie = error.response.headers['set-cookie'] || [];

        // Keep only the `name=value` pair of each cookie, dropping
        // attributes such as Path or HttpOnly.
        const Cookie = setCookie.map((cookie) => cookie.split(';')[0]).join('; ');

        const headers = Cookie ? { Cookie } : {};

        return axiosInstance.get(redirectUrl, { headers });
      }
      return Promise.reject(error);
    }
  );

  // Await and return the body: without this the function would resolve to
  // undefined and request failures would go unhandled.
  const response = await axiosInstance.get(url);

  return response.data;
};

0 comments on commit 7e6e8aa

Please sign in to comment.