Replace rss-parser and manual work with extractus

This commit is contained in:
Carter McBride 2024-09-13 14:59:26 -06:00
parent 9d7275b69e
commit 10d017a511
3 changed files with 15 additions and 119 deletions

BIN
bun.lockb

Binary file not shown.

View file

@ -28,8 +28,8 @@
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@astrojs/tailwind": "^5.1.0", "@astrojs/tailwind": "^5.1.0",
"astro": "4.15.6", "@extractus/feed-extractor": "^7.1.3",
"rss-parser": "^3.13.0" "astro": "4.15.6"
}, },
"devDependencies": { "devDependencies": {
"@biomejs/biome": "^1.8.1", "@biomejs/biome": "^1.8.1",

View file

@ -1,4 +1,4 @@
import Parser from "rss-parser"; import { extract } from "@extractus/feed-extractor";
interface FeedItem { interface FeedItem {
title: string; title: string;
@ -9,10 +9,6 @@ interface FeedItem {
category: string; category: string;
} }
const MAX_CONNECTIONS = Number.POSITIVE_INFINITY;
const DELAY_MS = 850;
const parser = new Parser();
function readFeedCategoriesFromEnv(): Record<string, string[]> { function readFeedCategoriesFromEnv(): Record<string, string[]> {
if (import.meta.env.FEEDS) { if (import.meta.env.FEEDS) {
return JSON.parse(import.meta.env.FEEDS); return JSON.parse(import.meta.env.FEEDS);
@ -20,130 +16,30 @@ function readFeedCategoriesFromEnv(): Record<string, string[]> {
throw new Error("FEEDS environment variable is not set"); throw new Error("FEEDS environment variable is not set");
} }
async function getRawFeedContents(response: Response): Promise<unknown> {
const contentType = response.headers.get("content-type")?.split(";")[0];
if (!contentType) return {};
if (
[
"application/atom+xml",
"application/rss+xml",
"application/xml",
"text/xml",
"text/html",
].includes(contentType)
) {
return response.text();
}
if (["application/json", "application/feed+json"].includes(contentType)) {
return response.json();
}
return {};
}
interface RawFeedItem {
creator?: string;
title: string;
link: string;
pubDate: string;
"content:encoded"?: string;
"content:encodedSnippet"?: string;
"dc:creator"?: string;
comments?: string;
content: string;
contentSnippet: string;
guid: string;
categories: unknown[];
isoDate: string;
[other: string]: unknown;
}
interface RawFeed {
items: RawFeedItem[];
feedUrl?: string;
image?: {
link: string;
url: string;
title: string;
width: string;
height: string;
};
pagenationLinks?: {
self: string;
next: string;
};
title: string;
description: string;
generator: string;
link: string;
language?: string;
lastBuildDate?: string;
[other: string]: unknown;
}
function getTitle(item: RawFeed | RawFeedItem): string {
const titleValues: (keyof RawFeed | keyof RawFeedItem)[] = [
"title",
"url",
"link",
];
const keys = Object.keys(item);
const titleProperty = titleValues.find(
(titleValue) => keys.includes(titleValue) && item[titleValue],
);
return titleProperty ? (item[titleProperty] as string) : "";
}
function getLink(item: RawFeed | RawFeedItem): string {
const linkValues: (keyof RawFeed | keyof RawFeedItem)[] = [
"link",
"url",
"guid",
"home_page_url",
];
const keys = Object.keys(item);
const linkProperty = linkValues.find((linkValue) => keys.includes(linkValue));
return linkProperty ? (item[linkProperty] as string) : "";
}
function getTimestamp(item: RawFeedItem): number {
const dateString =
item.pubDate || item.isoDate || item.date || item.date_published;
if (!dateString || typeof dateString !== "string") {
return Date.now();
}
const timestamp = new Date(dateString).getTime();
return Number.isNaN(timestamp) ? Date.now() : timestamp;
}
async function parseFeedContents( async function parseFeedContents(
feedUrl: string, feedUrl: string,
category: string, category: string,
): Promise<FeedItem[]> { ): Promise<FeedItem[]> {
console.log(`Fetching: ${feedUrl}...`); console.log(`Fetching: ${feedUrl}...`);
const response = await fetch(feedUrl); let items: FeedItem[] = [];
const body = await getRawFeedContents(response);
if (!body) {
throw new Error(`Failed to fetch feed: ${feedUrl}`);
}
try { try {
const rawFeed = ( const result = await extract(feedUrl, {
typeof body === "string" ? await parser.parseString(body) : body descriptionMaxLen: 1,
) as RawFeed; useISODateFormat: false,
const feedName = getTitle(rawFeed); });
const feedLink = getLink(rawFeed); items = (result.entries ?? []).map((entry) => ({
const items: FeedItem[] = rawFeed.items.flatMap((item) => ({ feedName: result.title,
feedName, feedLink: result.link,
feedLink,
category, category,
title: item.title, title: entry.title,
pubIsoDate: getTimestamp(item), pubIsoDate: new Date(entry.published).getTime(),
link: item.link, link: entry.link,
})); }));
return items;
} catch (err) { } catch (err) {
console.error(`${feedUrl}\n${err}`); console.error(`${feedUrl}\n${err}`);
throw err; throw err;
} }
return items;
} }
export default async function getAllFeedItems(): Promise<{ export default async function getAllFeedItems(): Promise<{