1
0
Fork 0
mirror of https://github.com/sbrl/PolyFeed.git synced 2024-12-24 10:45:02 +00:00

It (basically) works! We've still got a few bugs to work out though.

This commit is contained in:
Starbeamrainbowlabs 2019-07-29 01:12:36 +01:00
parent 544bce4f54
commit 7a70bf73a2
Signed by: sbrl
GPG key ID: 1BE5172E637709C2
8 changed files with 261 additions and 101 deletions

View file

@ -9,6 +9,7 @@ using Fizzler.Systems.HtmlAgilityPack;
using HtmlAgilityPack;
using Microsoft.SyndicationFeed;
using Microsoft.SyndicationFeed.Atom;
using PolyFeed.Helpers;
namespace PolyFeed
{
@ -19,18 +20,22 @@ namespace PolyFeed
AtomFeedWriter feed = null;
public FeedBuilder() {
xml = XmlWriter.Create(result);
xml = XmlWriter.Create(result, new XmlWriterSettings() {
Indent = true,
IndentChars = "\t"
});
feed = new AtomFeedWriter(xml);
}
public async Task AddSource(FeedSource source) {
WebResponse response = await WebRequest.Create(source.Url).GetResponseAsync();
await Console.Error.WriteLineAsync("[Builder] Downloading content");
WebResponse response = await WebRequest.Create(source.Feed.Url).GetResponseAsync();
await Console.Error.WriteLineAsync("[Builder] Generating feed header");
// Write the header
await feed.WriteGenerator("Polyfeed", "https://gitlab.com/sbrl/PolyFeed.git", Program.GetProgramVersion());
await feed.WriteId(source.Url);
await feed.WriteGenerator("Polyfeed", "https://github.com/sbrl/PolyFeed.git", Program.GetProgramVersion());
await feed.WriteId(source.Feed.Url);
string lastModified = response.Headers.Get("last-modified");
if (string.IsNullOrWhiteSpace(lastModified))
await feed.WriteUpdated(DateTimeOffset.Now);
@ -39,63 +44,92 @@ namespace PolyFeed
string contentType = response.Headers.Get("content-type");
switch (source.SourceType) {
switch (source.Feed.Type) {
case SourceType.HTML:
await AddSourceHtml(source, response);
break;
default:
throw new NotImplementedException($"Error: The source type {source.SourceType} hasn't been implemented yet.");
throw new NotImplementedException($"Error: The source type {source.Feed.Type} hasn't been implemented yet.");
}
await Console.Error.WriteLineAsync("[Builder] Done!");
}
/// <summary>
/// Parses the body of <paramref name="response" /> as HTML and writes the feed
/// title, subtitle, optional logo and one entry per element matched by the
/// configured entry selector.
/// </summary>
/// <param name="source">The feed configuration holding the selectors to apply.</param>
/// <param name="response">The response whose stream contains the HTML document.</param>
/// <exception cref="ApplicationException">
/// Thrown when a configured selector matches no element, or when a matched
/// element doesn't carry the configured attribute.
/// </exception>
private async Task AddSourceHtml(FeedSource source, WebResponse response) {
	await Console.Error.WriteLineAsync("[Builder/Html] Parsing Html");

	// Parse the HTML
	HtmlDocument html = new HtmlDocument();
	using (StreamReader reader = new StreamReader(response.GetResponseStream()))
		html.LoadHtml(await reader.ReadToEndAsync());

	HtmlNode document = html.DocumentNode;

	await Console.Error.WriteLineAsync("[Builder/Html] Generating feed content");

	// Add the title
	await feed.WriteTitle(ReferenceSubstitutor.Replace(source.Feed.Title, document));
	await feed.WriteSubtitle(ReferenceSubstitutor.Replace(source.Feed.Subtitle, document));

	// Add the logo
	if (source.Feed.Logo != null) {
		HtmlNode logoNode = document.QuerySelector(source.Feed.Logo.Selector);
		if (logoNode == null)
			throw new ApplicationException($"Error: Logo selector {source.Feed.Logo.Selector} failed to find any elements.");

		// No attribute configured means "use the element's text content"
		string logoUrl = string.IsNullOrWhiteSpace(source.Feed.Logo.Attribute)
			? logoNode.InnerText
			: RequireAttribute(logoNode, source.Feed.Logo.Attribute, "logo").Value;
		xml.WriteElementString("logo", logoUrl);
	}

	// Resolved once; every entry url is interpreted relative to the page it came from
	Uri baseUrl = new Uri(source.Feed.Url);

	// Add the feed entries
	foreach (HtmlNode nextNode in document.QuerySelectorAll(source.Entries.Selector)) {
		HtmlNode urlNode = nextNode.QuerySelector(source.Entries.Url.Selector);
		if (urlNode == null)
			throw new ApplicationException("Error: Failed to match entry url selector against an element.");

		string url = string.IsNullOrWhiteSpace(source.Entries.Url.Attribute)
			? urlNode.InnerText
			: RequireAttribute(urlNode, source.Entries.Url.Attribute, "entry url").DeEntitizeValue;

		SyndicationItem nextItem = new SyndicationItem() {
			// Uri(Uri, string) accepts both relative and absolute urls, whereas
			// new Uri(string) only understands absolute ones.
			Id = new Uri(baseUrl, url).ToString(),
			Title = ReferenceSubstitutor.Replace(source.Entries.Title, nextNode),
			Description = ReferenceSubstitutor.Replace(source.Entries.Content, nextNode),
		};

		if (source.Entries.Published != null) {
			nextItem.Published = DateTime.Parse(
				nextNode.QuerySelectorAttributeOrText(
					source.Entries.Published
				)
			);
		}

		// Guard on LastUpdated itself: testing Published here would dereference
		// a null LastUpdated whenever only Published is configured.
		if (source.Entries.LastUpdated != null) {
			nextItem.LastUpdated = DateTime.Parse(
				nextNode.QuerySelectorAttributeOrText(
					source.Entries.LastUpdated
				)
			);
		}
		else // It requires one, apparently
			nextItem.LastUpdated = DateTimeOffset.Now;

		if (source.Entries.AuthorName == null)
			throw new ApplicationException("Error: No author_name selector was specified for the feed entries.");

		SyndicationPerson author = new SyndicationPerson(
			nextNode.QuerySelectorAttributeOrText(source.Entries.AuthorName).Trim(),
			""
		);
		if (source.Entries.AuthorUrl != null)
			author.Uri = nextNode.QuerySelectorAttributeOrText(source.Entries.AuthorUrl);

		nextItem.AddContributor(author);

		// Write each entry exactly once
		await feed.Write(nextItem);
	}
}

/// <summary>
/// Fetches the named attribute from a node, failing with a readable error
/// instead of a null dereference when the attribute is absent.
/// </summary>
/// <param name="node">The node to read the attribute from.</param>
/// <param name="attributeName">The name of the attribute to fetch.</param>
/// <param name="description">Short label for the node, used in the error message.</param>
/// <exception cref="ApplicationException">Thrown when the attribute doesn't exist on the node.</exception>
private static HtmlAttribute RequireAttribute(HtmlNode node, string attributeName, string description) {
	HtmlAttribute attribute = node.Attributes[attributeName];
	if (attribute == null)
		throw new ApplicationException($"Error: The {description} element doesn't have an attribute named {attributeName}.");
	return attribute;
}
public string Render()
public async Task<string> Render()
{
xml.Flush();
await feed.Flush();
xml.WriteEndDocument();
xml.Flush();
return result.ToString();
}
}

View file

@ -4,8 +4,29 @@ namespace PolyFeed
{
public enum SourceType { HTML, XML, JSON };
public class FeedSource
/// <summary>
/// A selector paired with an optional attribute name.
/// </summary>
public class SelectorSettings
{
	/// <summary>
	/// The selector used to pick out the target element.
	/// </summary>
	public string Selector { get; set; }

	/// <summary>
	/// Which attribute of the matched element to read the value from.
	/// Use an empty string to take the element's own content rather than
	/// the content of one of its attributes.
	/// </summary>
	public string Attribute { get; set; }

	/// <summary>
	/// Renders the selector and attribute in a bracketed, human-readable form.
	/// </summary>
	public override string ToString()
		=> $"[SelectorSettings Selector = {Selector}, Attribute = {Attribute}]";
}
public class FeedSettings
{
public string Output { get; set; }
/// <summary>
/// The url of the source document to parse.
/// </summary>
@ -15,7 +36,9 @@ namespace PolyFeed
/// <summary>
/// The type of source document to expect.
/// </summary>
public SourceType SourceType { get; set; }
public string SourceType { get; set; }
public SourceType Type => (SourceType)Enum.Parse(typeof(SourceType), SourceType, true);
/// <summary>
/// The title of the feed.
@ -29,22 +52,14 @@ namespace PolyFeed
/// <value>The subtitle.</value>
public string Subtitle { get; set; }
#region Entries
/// <summary>
/// A selector that matches against an element that contains the URL that an
/// entry should link to.
/// Relative to the element selected by <see cref="EntrySelector" />.
/// Selector that matches against the feed logo url.
/// </summary>
public string EntryUrlSelector { get; set; }
/// <summary>
/// The name of the attribute on the element selected by <see cref="EntryUrlSelector" />.
/// Set to an empty string to select the content of the element instead of the
/// content of an attribute.
/// </summary>
public string EntryUrlAttribute { get; set; } = "";
public SelectorSettings Logo { get; set; }
}
public class EntrySettings
{
/// <summary>
/// The selector that specifies the location of nodes in the object model that
/// should be added to the feed.
@ -53,41 +68,42 @@ namespace PolyFeed
/// - XML: XPath (e.g. //element_name)
/// - JSON: Dotted object (e.g. items.fruit)
/// </summary>
public string EntrySelector { get; set; }
public string Selector { get; set; }
/// <summary>
/// Selector settings to get the URL that an entry should link to.
/// </summary>
public SelectorSettings Url { get; set; } = new SelectorSettings() { Attribute = "href" };
/// <summary>
/// The title of an entry.
/// Selectors may be included in curly braces {} to substitute in content.
/// Such selectors are relative to the current feed entry.
/// The format varies in the samem way as <see cref="EntrySelector" /> does.
/// The format varies in the same way as <see cref="Selector" /> does.
/// </summary>
public string EntryTitle { get; set; }
public string Title { get; set; }
/// <summary>
/// Same as <see cref="EntryTitle" />, but for the body of an entry. HTML is allowed.
/// Same as <see cref="Title" />, but for the body of an entry. HTML is allowed.
/// </summary>
public string EntryContent { get; set; }
public string Content { get; set; }
/// <summary>
/// The selector for the node that contains the date published for an entry.
/// The selector for the date published for an entry.
/// </summary>
public string EntryPublishedSelector { get; set; }
public SelectorSettings Published { get; set; }
/// <summary>
/// The name of the attribute that contains the date published for an entry.
/// Set to <see cref="string.Empty" /> to use the content of the node itself.
/// The selector for the date an entry was last updated.
/// </summary>
public string EntryPublishedAttribute { get; set; }
public SelectorSettings LastUpdated { get; set; }
/// <summary>
/// Same as <see cref="EntryPublishedSelector" />, but for the last updated.
/// If not specified, the last updated will be omitted.
/// </summary>
public string EntryLastUpdatedSelector { get; set; }
/// <summary>
/// Same as <see cref="EntryPublishedAttribute" />.
/// </summary>
public string EntryLastUpdatedAttribute { get; set; }
#endregion
public SelectorSettings AuthorName { get; set; }
public SelectorSettings AuthorUrl { get; set; }
}
public class FeedSource
{
public FeedSettings Feed { get; set; }
public EntrySettings Entries { get; set; }
}
}

View file

@ -0,0 +1,34 @@
using System;
using Fizzler.Systems.HtmlAgilityPack;
using HtmlAgilityPack;
namespace PolyFeed.Helpers
{
/// <summary>
/// Extension methods that read a value out of an HTML node according to a
/// <see cref="SelectorSettings" /> instance.
/// </summary>
public static class HtmlHelpers
{
	/// <summary>
	/// Finds the first descendant matching <c>settings.Selector</c> and returns
	/// either the value of <c>settings.Attribute</c> or, when no attribute name
	/// is set, the node's inner text.
	/// </summary>
	/// <param name="htmlNode">The node to search beneath.</param>
	/// <param name="settings">The selector and (optional) attribute name to use.</param>
	/// <returns>The attribute value, or the inner text of the matched node.</returns>
	/// <exception cref="ArgumentNullException">Thrown when an argument is null.</exception>
	/// <exception cref="ApplicationException">
	/// Thrown when the selector matches nothing, or the matched node lacks the requested attribute.
	/// </exception>
	public static string QuerySelectorAttributeOrText(this HtmlNode htmlNode, SelectorSettings settings)
	{
		HtmlNode selectedNode = SelectRequiredNode(htmlNode, settings);

		if (string.IsNullOrWhiteSpace(settings.Attribute))
			return selectedNode.InnerText;

		return ReadRequiredAttribute(selectedNode, settings);
	}

	/// <summary>
	/// Finds the first descendant matching <c>settings.Selector</c> and returns
	/// either the value of <c>settings.Attribute</c> or, when no attribute name
	/// is set, the node's inner HTML.
	/// </summary>
	/// <param name="htmlNode">The node to search beneath.</param>
	/// <param name="settings">The selector and (optional) attribute name to use.</param>
	/// <returns>The attribute value, or the inner HTML of the matched node.</returns>
	/// <exception cref="ArgumentNullException">Thrown when an argument is null.</exception>
	/// <exception cref="ApplicationException">
	/// Thrown when the selector matches nothing, or the matched node lacks the requested attribute.
	/// </exception>
	public static string QuerySelectorAttributeOrHtml(this HtmlNode htmlNode, SelectorSettings settings)
	{
		HtmlNode selectedNode = SelectRequiredNode(htmlNode, settings);

		if (string.IsNullOrWhiteSpace(settings.Attribute))
			return selectedNode.InnerHtml;

		return ReadRequiredAttribute(selectedNode, settings);
	}

	/// <summary>
	/// Runs the selector and insists on a match.
	/// </summary>
	private static HtmlNode SelectRequiredNode(HtmlNode htmlNode, SelectorSettings settings)
	{
		if (htmlNode == null)
			throw new ArgumentNullException(nameof(htmlNode));
		if (settings == null)
			throw new ArgumentNullException(nameof(settings));

		HtmlNode selectedNode = htmlNode.QuerySelector(settings.Selector);
		if (selectedNode == null)
			throw new ApplicationException($"Error: Selector {settings.Selector} failed to find any elements.");

		return selectedNode;
	}

	/// <summary>
	/// Reads the configured attribute from an already-matched node.
	/// </summary>
	private static string ReadRequiredAttribute(HtmlNode selectedNode, SelectorSettings settings)
	{
		// The attribute indexer yields null for an unknown name, so check before
		// touching .Value to avoid an unhelpful NullReferenceException.
		HtmlAttribute attribute = selectedNode.Attributes[settings.Attribute];
		if (attribute == null)
			throw new ApplicationException($"Error: The element matched by selector {settings.Selector} doesn't have an attribute named {settings.Attribute}.");

		return attribute.Value;
	}
}
}

View file

@ -147,12 +147,15 @@
<Compile Include="SubstitutionLexer.cs" />
<Compile Include="Salamander.Core\LexerPool.cs" />
<Compile Include="ReferenceSubstitutor.cs" />
<Compile Include="SnakeCasePropertySelector.cs" />
<Compile Include="Helpers\HtmlHelpers.cs" />
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
<ItemGroup>
<Folder Include="Salamander.Core\" />
<Folder Include="Helpers\" />
</ItemGroup>
<Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
<Import Project="..\packages\NETStandard.Library.2.0.3\build\netstandard2.0\NETStandard.Library.targets" Condition="Exists('..\packages\NETStandard.Library.2.0.3\build\netstandard2.0\NETStandard.Library.targets')" />

View file

@ -13,7 +13,7 @@ namespace PolyFeed
public readonly string ProgramName = "PolyFeed";
public readonly string Description = "creates Atom feeds from websites that don't support it";
public string ConfigFilepath = "feed.toml";
public string ConfigFilepath = null;
public string OutputFilepath = "feed.atom";
}
@ -38,20 +38,7 @@ namespace PolyFeed
{
case "-h":
case "--help":
Console.WriteLine($"{settings.ProgramName}, {GetProgramVersion()}");
Console.WriteLine(" By Starbeamrainbowlabs");
Console.WriteLine();
Console.WriteLine($"This program {settings.Description}.");
Console.WriteLine();
Console.WriteLine("Usage:");
Console.WriteLine($" ./{Path.GetFileName(Assembly.GetExecutingAssembly().Location)} [arguments]");
Console.WriteLine();
Console.WriteLine("Options:");
Console.WriteLine(" -h --help Displays this message");
Console.WriteLine(" -v --version Outputs the version number of this program");
Console.WriteLine(" -c --config Specifies the location of the feed configuration file to use to generate a feed (default: feed.toml)");
Console.WriteLine(" -o --output Specifies the location to write the output feed to (default: feed.atom)");
showHelp();
return 0;
case "-v":
@ -71,37 +58,64 @@ namespace PolyFeed
}
}
if (settings.ConfigFilepath == null) {
Console.Error.WriteLine("Error: No configuration filepath detected. Try " +
"using --help to show usage information.");
return 1;
}
///// 2: Acquire environment variables /////
///// 3: Run program /////
return 0;
return run().Result;
}
private static async Task<string> run()
private static void showHelp()
{
FeedSource feedSource = new FeedSource();
TomlTable config = Toml.ReadFile(settings.ConfigFilepath, TomlSettings.Create());
Console.WriteLine($"{settings.ProgramName}, {GetProgramVersion()}");
Console.WriteLine(" By Starbeamrainbowlabs");
foreach (KeyValuePair<string, TomlObject> item in config) {
string key = Regex.Replace(
item.Key,
@"(^|_)[A-Za-z0-9]",
(match) => match.Value.Replace("_", "").ToUpper()
);
string value = item.Value.Get<TomlString>().Value;
feedSource.GetType().GetProperty(value).SetValue(
feedSource,
value
);
Console.WriteLine();
Console.WriteLine($"This program {settings.Description}.");
Console.WriteLine();
Console.WriteLine("Usage:");
Console.WriteLine($" ./{Path.GetFileName(Assembly.GetExecutingAssembly().Location)} [arguments]");
Console.WriteLine();
Console.WriteLine("Options:");
Console.WriteLine(" -h --help Displays this message");
Console.WriteLine(" -v --version Outputs the version number of this program");
Console.WriteLine(" -c --config Specifies the location of the TOML feed configuration file to use to generate a feed");
Console.WriteLine(" -o --output Specifies the location to write the output feed to (default: feed.atom)");
}
/// <summary>
/// Reads the TOML feed configuration, builds the feed from it, and writes the
/// rendered result to the output file.
/// </summary>
/// <returns>
/// 0 on success, 1 when the configuration can't be used, and 2 when building
/// the feed fails with an <see cref="ApplicationException" />.
/// </returns>
private static async Task<int> run()
{
	// Map snake_case TOML keys onto the PascalCase properties of the settings classes
	TomlSettings parseSettings = TomlSettings.Create(s =>
		s.ConfigurePropertyMapping(m => m.UseTargetPropertySelector(new SnakeCasePropertySelector()))
	);

	FeedSource feedSource = Toml.ReadFile<FeedSource>(settings.ConfigFilepath, parseSettings);

	if (feedSource == null) {
		Console.Error.WriteLine("Error: Something went wrong when parsing your settings file :-(");
		return 1;
	}

	// Both tables are dereferenced unconditionally further down, so fail early
	// with a readable message instead of a NullReferenceException.
	if (feedSource.Feed == null || feedSource.Entries == null) {
		Console.Error.WriteLine("Error: The settings file must contain both a [feed] and an [entries] table.");
		return 1;
	}

	// An output path given in the settings file replaces the current output path
	if (!string.IsNullOrWhiteSpace(feedSource.Feed.Output))
		settings.OutputFilepath = feedSource.Feed.Output;

	FeedBuilder feedBuilder = new FeedBuilder();
	try {
		await feedBuilder.AddSource(feedSource);
	} catch (ApplicationException error) {
		Console.Error.WriteLine(error.Message);
		return 2;
	}

	await Console.Error.WriteLineAsync($"[Output] Writing feed to {settings.OutputFilepath}");
	File.WriteAllText(settings.OutputFilepath, await feedBuilder.Render());

	return 0;
}

View file

@ -15,10 +15,14 @@ namespace PolyFeed
SubstitutionLexer lexer = lexerPool.AcquireLexer();
lexer.Initialise(inputString);
bool useHtml = true;
foreach (LexerToken<SubstitutionToken> nextToken in lexer.TokenStream())
{
switch (nextToken.Type) {
case SubstitutionToken.BraceOpen:
useHtml = nextToken.Value.Length == 1;
lexer.SaveRuleStates();
lexer.EnableRule(SubstitutionToken.Identifier);
lexer.DisableRule(SubstitutionToken.Text);
@ -32,7 +36,12 @@ namespace PolyFeed
break;
case SubstitutionToken.Identifier:
result.Append(rootElement.QuerySelector(nextToken.Value));
HtmlNode targetNode = rootElement.QuerySelector(nextToken.Value);
if (targetNode == null) {
Console.Error.WriteLine($"Warning: Selector {nextToken.Value} failed to match any elements");
break;
}
result.Append(useHtml ? targetNode.InnerHtml : targetNode.InnerText);
break;
}
}

View file

@ -0,0 +1,27 @@
using System;
using System.Reflection;
using System.Text.RegularExpressions;
using Nett;
namespace PolyFeed
{
/// <summary>
/// Maps snake_case configuration keys (e.g. <c>source_type</c>) onto PascalCase
/// property names (e.g. <c>SourceType</c>) on a target type.
/// </summary>
public class SnakeCasePropertySelector : ITargetPropertySelector
{
	/// <summary>
	/// Matches the first character of the key and every character that directly
	/// follows an underscore. Built once rather than re-parsed for each key.
	/// </summary>
	private static readonly Regex wordStart = new Regex(@"(^|_)[A-Za-z0-9]");

	public SnakeCasePropertySelector()
	{
	}

	/// <summary>
	/// Converts <paramref name="key" /> to PascalCase and looks up a public
	/// property with that name on <paramref name="target" />.
	/// </summary>
	/// <param name="key">The snake_case key to translate.</param>
	/// <param name="target">The type to search for a matching property.</param>
	/// <returns>The matching property, or null if <paramref name="target" /> has none with that name.</returns>
	public PropertyInfo TryGetTargetProperty(string key, Type target)
	{
		// Invariant casing: culture-sensitive ToUpper() turns "i" into "İ" under
		// e.g. tr-TR, after which the property lookup would silently fail.
		string transformedKey = wordStart.Replace(
			key,
			(match) => match.Value.Replace("_", "").ToUpperInvariant()
		);
		//Console.WriteLine($"{key} -> {transformedKey}");
		return target.GetProperty(transformedKey);
	}
}
}

23
examples/twitter.toml Normal file
View file

@ -0,0 +1,23 @@
# Example PolyFeed configuration that turns a mobile Twitter profile page into an Atom feed.
# Keys are written in snake_case and are mapped onto the PascalCase properties
# of the settings classes (e.g. source_type -> SourceType).

[feed]
# Where to write the generated feed
output = "CrossCodeLea-Twitter.atom"
# The page to download and parse
url = "https://mobile.twitter.com/CrossCodeLea"
source_type = "html"
# Selectors in braces are substituted with content from the page
title = "{{.username}} on Twitter"
subtitle = "{{.details}}"
# The key must be "logo" to match the Logo setting; "logo_url" maps to no property
logo = { selector = ".avatar img", attribute = "src" }

[entries]
# One feed entry is generated per element matched by this selector
selector = ".tweet"
# Selectors in title and content are relative to the current entry
title = "Tweet by {.username}"
content = "{.tweet-social-context}<br />{.avatar}{.fullname}<br />{.tweet-content}"
url = { selector = ".metadata a", attribute = "href" }
# No attribute given, so the text content of the matched element is used
author_name = { selector = ".username" }
# published = { selector = "", attribute = "" }
# last_updated = { selector = "", attribute = "" }