RSS-Bridge Custom Bridges
RSS-Bridge bridge based on XPath selectors
<?php
/**
 * Bridge that builds a feed from the article listing on newssite.com using
 * XPath expressions.
 *
 * The XPATH_EXPRESSION_* constants describe where each item field lives in the
 * listing page. When USE_PAGE_CONTENT is true, collectData() additionally
 * downloads every article page and replaces the item's content with the nodes
 * matched by XPATH_EXPRESSION_ITEM_CONTENT on that page.
 *
 * Diagnostics are written with error_log() rather than echo: anything echoed
 * from a bridge ends up inside the HTTP response and corrupts the feed.
 */
class NewsSiteBridge extends XPathAbstract
{
    /** Name of the bridge, usually the website or service name. */
    const NAME = 'News Site Bridge';

    /** Base URL of the news website. */
    const URI = 'https://newssite.com/';

    /** A brief description of the bridge's purpose. */
    const DESCRIPTION = 'Returns the latest news articles from NewsSite.com';

    /** The maintainer's username or identifier. */
    const MAINTAINER = 'jctopping';

    /** Time in seconds to cache the content (3600 seconds = 1 hour). */
    const CACHE_TIMEOUT = 3600;

    /** URL from which the feed generates content; here the same as URI. */
    const FEED_SOURCE_URL = 'https://newssite.com/';

    /** XPath locating the individual articles within the list. */
    const XPATH_EXPRESSION_ITEM = "//div[contains(@class, 'article-list')]//article";

    /** XPath locating each article's title (relative to the item node). */
    const XPATH_EXPRESSION_ITEM_TITLE = ".//h2[contains(@class, 'article-title')]/a";

    /**
     * XPath extracting the summary/content. It is used for the listing page
     * and, when USE_PAGE_CONTENT is true, it is also evaluated by
     * fetchAndProcessContent() against each article's own page.
     */
    const XPATH_EXPRESSION_ITEM_CONTENT = ".//div[contains(@class, 'article-summary')]";

    /**
     * When true, collectData() fetches every article page and replaces the
     * item content with what XPATH_EXPRESSION_ITEM_CONTENT matches there.
     * This flag is read only by collectData() in this class.
     */
    const USE_PAGE_CONTENT = true;

    /** XPath extracting the URL of the full article. */
    const XPATH_EXPRESSION_ITEM_URI = ".//h2[contains(@class, 'article-title')]/a/@href";

    /** XPath locating the name of the article's author. */
    const XPATH_EXPRESSION_ITEM_AUTHOR = ".//span[contains(@class, 'author-name')]";

    /** XPath extracting the article's publication date. */
    const XPATH_EXPRESSION_ITEM_TIMESTAMP = ".//time[contains(@class, 'published-date')]/@datetime";

    /** Expected format of the date/time string (ISO 8601 with UTC offset). */
    const DATE_FIELD_FORMAT = 'Y-m-d\TH:i:sP';

    /** XPath locating the main image or other media. */
    const XPATH_EXPRESSION_ITEM_ENCLOSURES = ".//figure[contains(@class, 'article-image')]/img/@src";

    /** XPath extracting article categories or tags. */
    const XPATH_EXPRESSION_ITEM_CATEGORIES = ".//div[contains(@class, 'article-categories')]/a";

    /** Set to true if there is a need to correct encoding issues. */
    const SETTING_FIX_ENCODING = false;

    /**
     * Collects the items via the parent implementation and, when
     * USE_PAGE_CONTENT is enabled, swaps each item's content for the content
     * extracted from the article's own page.
     *
     * Items whose page yields no content keep the content the parent set.
     */
    public function collectData()
    {
        parent::collectData();

        if (!self::USE_PAGE_CONTENT) {
            return;
        }

        foreach ($this->items as &$item) {
            $uri = $item->uri;
            if (empty($uri)) {
                // Nothing to download; keep whatever content the listing page provided.
                error_log('Skipping page content fetch for an item without a URI');
                continue;
            }

            $itemContent = $this->fetchAndProcessContent($uri);
            if ($itemContent !== '') {
                $item->content = $itemContent;
            } else {
                error_log('No content found or processed for item: ' . $uri);
            }
        }
        // Break the reference so a later use of $item cannot silently
        // overwrite the last element of $this->items.
        unset($item);
    }

    /**
     * Downloads an article page and returns the HTML of every node matched by
     * XPATH_EXPRESSION_ITEM_CONTENT, concatenated in document order.
     *
     * @param string $uri URL of the article page.
     * @return string Concatenated outer HTML of the matched nodes, or an empty
     *                string when the page could not be fetched, parsed, or
     *                contained no matching node.
     */
    protected function fetchAndProcessContent($uri)
    {
        $webPageContent = getContents($uri);
        if (!is_string($webPageContent) || trim($webPageContent) === '') {
            error_log("Failed to fetch content for URI: $uri");
            return '';
        }

        // Collect libxml parse errors internally instead of silencing them
        // with "@", and restore the caller's setting afterwards.
        $previousSetting = libxml_use_internal_errors(true);
        $dom = new \DOMDocument();
        $loaded = $dom->loadHTML($webPageContent, LIBXML_NOWARNING | LIBXML_NOERROR);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        if ($loaded === false) {
            error_log("Failed to parse HTML for URI: $uri");
            return '';
        }

        $xpath = new \DOMXPath($dom);
        $contentNodes = $xpath->query(self::XPATH_EXPRESSION_ITEM_CONTENT);

        // query() returns false only for an invalid expression; "no match" is
        // an empty (but truthy) DOMNodeList, so the length must be checked too.
        if ($contentNodes === false || $contentNodes->length === 0) {
            error_log("Failed to find content using XPath for URI: $uri");
            return '';
        }

        $innerHTML = '';
        foreach ($contentNodes as $node) {
            $innerHTML .= $dom->saveHTML($node);
        }

        return $innerHTML;
    }

    /**
     * Uses the article URI as the unique identifier of a feed item.
     *
     * @param FeedItem $item Item being built.
     * @return mixed Whatever FeedItem::getURI() returns for the item.
     */
    protected function generateItemId(FeedItem $item)
    {
        return $item->getURI();
    }

    /**
     * Converts the extracted date string into a Unix timestamp.
     *
     * The value is first parsed strictly with DATE_FIELD_FORMAT. If that fails
     * (for example because the site adds fractional seconds), strtotime() is
     * tried as a lenient fallback before giving up.
     *
     * @param string $value Raw date string taken from the page.
     * @return int|false Unix timestamp, or false when the value cannot be parsed.
     */
    protected function formatItemTimestamp($value)
    {
        $value = trim((string) $value);

        $dti = \DateTimeImmutable::createFromFormat(self::DATE_FIELD_FORMAT, $value);
        if ($dti !== false) {
            return $dti->getTimestamp();
        }

        $fallback = ($value === '') ? false : strtotime($value);
        if ($fallback === false) {
            error_log("Failed to parse timestamp: $value");
        }

        return $fallback;
    }
}