From 7a70bf73a2a05e8a03c861cfdfcf2ebe6d4fa66c Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Mon, 29 Jul 2019 01:12:36 +0100 Subject: [PATCH] It (basically) works! We've still got a few bugs to work out though. --- PolyFeed/FeedBuilder.cs | 94 ++++++++++++++++++--------- PolyFeed/FeedSource.cs | 88 +++++++++++++++---------- PolyFeed/Helpers/HtmlHelpers.cs | 34 ++++++++++ PolyFeed/PolyFeed.csproj | 3 + PolyFeed/Program.cs | 82 +++++++++++++---------- PolyFeed/ReferenceSubstitutor.cs | 11 +++- PolyFeed/SnakeCasePropertySelector.cs | 27 ++++++++ examples/twitter.toml | 23 +++++++ 8 files changed, 261 insertions(+), 101 deletions(-) create mode 100644 PolyFeed/Helpers/HtmlHelpers.cs create mode 100644 PolyFeed/SnakeCasePropertySelector.cs create mode 100644 examples/twitter.toml diff --git a/PolyFeed/FeedBuilder.cs b/PolyFeed/FeedBuilder.cs index 9359efa..17c5a9e 100644 --- a/PolyFeed/FeedBuilder.cs +++ b/PolyFeed/FeedBuilder.cs @@ -9,6 +9,7 @@ using Fizzler.Systems.HtmlAgilityPack; using HtmlAgilityPack; using Microsoft.SyndicationFeed; using Microsoft.SyndicationFeed.Atom; +using PolyFeed.Helpers; namespace PolyFeed { @@ -19,18 +20,22 @@ namespace PolyFeed AtomFeedWriter feed = null; public FeedBuilder() { - xml = XmlWriter.Create(result); + xml = XmlWriter.Create(result, new XmlWriterSettings() { + Indent = true, + IndentChars = "\t" + }); feed = new AtomFeedWriter(xml); } public async Task AddSource(FeedSource source) { - WebResponse response = await WebRequest.Create(source.Url).GetResponseAsync(); - + await Console.Error.WriteLineAsync("[Builder] Downloading content"); + WebResponse response = await WebRequest.Create(source.Feed.Url).GetResponseAsync(); + await Console.Error.WriteLineAsync("[Builder] Generating feed header"); // Write the header - await feed.WriteGenerator("Polyfeed", "https://gitlab.com/sbrl/PolyFeed.git", Program.GetProgramVersion()); - await feed.WriteId(source.Url); + await feed.WriteGenerator("Polyfeed", 
"https://github.com/sbrl/PolyFeed.git", Program.GetProgramVersion()); + await feed.WriteId(source.Feed.Url); string lastModified = response.Headers.Get("last-modified"); if (string.IsNullOrWhiteSpace(lastModified)) await feed.WriteUpdated(DateTimeOffset.Now); @@ -39,63 +44,92 @@ namespace PolyFeed string contentType = response.Headers.Get("content-type"); - switch (source.SourceType) { + switch (source.Feed.Type) { case SourceType.HTML: await AddSourceHtml(source, response); break; default: - throw new NotImplementedException($"Error: The source type {source.SourceType} hasn't been implemented yet."); + throw new NotImplementedException($"Error: The source type {source.Feed.Type} hasn't been implemented yet."); } + + await Console.Error.WriteLineAsync("[Builder] Done!"); } private async Task AddSourceHtml(FeedSource source, WebResponse response) { + await Console.Error.WriteLineAsync("[Builder/Html] Parsing Html"); + + // Parse the HTML HtmlDocument html = new HtmlDocument(); using (StreamReader reader = new StreamReader(response.GetResponseStream())) html.LoadHtml(await reader.ReadToEndAsync()); HtmlNode document = html.DocumentNode; - await feed.WriteTitle(ReferenceSubstitutor.Replace(source.Title, document)); - await feed.WriteSubtitle(ReferenceSubstitutor.Replace(source.Subtitle, document)); + await Console.Error.WriteLineAsync("[Builder/Html] Generating feed content"); - foreach (HtmlNode nextNode in document.QuerySelectorAll(source.EntrySelector)) { - HtmlNode urlNode = nextNode.QuerySelector(source.EntryUrlSelector); - string url = source.EntryUrlAttribute == string.Empty ? 
- urlNode.InnerText : urlNode.Attributes[source.EntryUrlAttribute].DeEntitizeValue; + // Add the title + await feed.WriteTitle(ReferenceSubstitutor.Replace(source.Feed.Title, document)); + await feed.WriteSubtitle(ReferenceSubstitutor.Replace(source.Feed.Subtitle, document)); + + // Add the logo + if (source.Feed.Logo != null) { + HtmlNode logoNode = document.QuerySelector(source.Feed.Logo.Selector); + xml.WriteElementString("logo", logoNode.Attributes[source.Feed.Logo.Attribute].Value); + } + + // Add the feed entries + foreach (HtmlNode nextNode in document.QuerySelectorAll(source.Entries.Selector)) { + HtmlNode urlNode = nextNode.QuerySelector(source.Entries.Url.Selector); + if (urlNode == null) + throw new ApplicationException("Error: Failed to match entry url selector against an element."); + + string url = string.IsNullOrWhiteSpace(source.Entries.Url.Attribute) ? + urlNode.InnerText : urlNode.Attributes[source.Entries.Url.Attribute].DeEntitizeValue; SyndicationItem nextItem = new SyndicationItem() { - Id = url, - Title = ReferenceSubstitutor.Replace(source.EntryTitle, nextNode), - Description = ReferenceSubstitutor.Replace(source.EntryContent, nextNode) + Id = new Uri(new Uri(source.Feed.Url), url).ToString(), + Title = ReferenceSubstitutor.Replace(source.Entries.Title, nextNode), + Description = ReferenceSubstitutor.Replace(source.Entries.Content, nextNode), }; - if (source.EntryPublishedSelector != string.Empty) { - HtmlNode publishedNode = nextNode.QuerySelector(source.EntryPublishedSelector); + if (source.Entries.Published != null) { nextItem.Published = DateTime.Parse( - source.EntryPublishedAttribute == string.Empty - ? 
publishedNode.InnerText - : publishedNode.Attributes[source.EntryPublishedAttribute].DeEntitizeValue + nextNode.QuerySelectorAttributeOrText( + source.Entries.Published + ) ); } - if (source.EntryPublishedSelector != string.Empty) { - HtmlNode lastUpdatedNode = nextNode.QuerySelector(source.EntryLastUpdatedSelector); - nextItem.Published = DateTime.Parse( - source.EntryLastUpdatedAttribute == string.Empty - ? lastUpdatedNode.InnerText - : lastUpdatedNode.Attributes[source.EntryLastUpdatedAttribute].DeEntitizeValue + if (source.Entries.LastUpdated != null) { + nextItem.LastUpdated = DateTime.Parse( + nextNode.QuerySelectorAttributeOrText( + source.Entries.LastUpdated + ) ); } + else // It requires one, apparently + nextItem.LastUpdated = DateTimeOffset.Now; + + SyndicationPerson author = new SyndicationPerson( + nextNode.QuerySelectorAttributeOrText(source.Entries.AuthorName).Trim(), + "" + ); + if(source.Entries.AuthorUrl != null) + author.Uri = nextNode.QuerySelectorAttributeOrText(source.Entries.AuthorUrl); + + nextItem.AddContributor(author); + + await feed.Write(nextItem); - await feed.Write(nextItem); } } - public string Render() + public async Task Render() { - xml.Flush(); + await feed.Flush(); xml.WriteEndDocument(); + xml.Flush(); return result.ToString(); } } diff --git a/PolyFeed/FeedSource.cs b/PolyFeed/FeedSource.cs index ffd629e..5a9de37 100644 --- a/PolyFeed/FeedSource.cs +++ b/PolyFeed/FeedSource.cs @@ -4,8 +4,29 @@ namespace PolyFeed { public enum SourceType { HTML, XML, JSON }; - public class FeedSource + public class SelectorSettings { + /// + /// A selector that matches against an element to select. + /// + public string Selector { get; set; } + /// + /// The name of the attribute to get the value of. + /// Set to an empty string to select the content of the element instead of the + /// content of an attribute. 
+ /// + public string Attribute { get; set; } + + public override string ToString() + { + return $"[SelectorSettings Selector = {Selector}, Attribute = {Attribute}]"; + } + } + + public class FeedSettings + { + public string Output { get; set; } + /// /// The url of the source document to parse. /// @@ -15,7 +36,9 @@ namespace PolyFeed /// /// The type of source document to expect. /// - public SourceType SourceType { get; set; } + public string SourceType { get; set; } + public SourceType Type => (SourceType)Enum.Parse(typeof(SourceType), SourceType, true); + /// /// The title of the feed. @@ -29,22 +52,14 @@ namespace PolyFeed /// The subtitle. public string Subtitle { get; set; } - - #region Entries - /// - /// A selector that matches against an element that contains the URL that an - /// entry should link to. - /// Relative to the element selected by . + /// Selector that matches against the feed logo url. /// - public string EntryUrlSelector { get; set; } - /// - /// The name of the attribute on the element selected by . - /// Set to an empty string to select the content of the element instead of the - /// content of an attribute. - /// - public string EntryUrlAttribute { get; set; } = ""; + public SelectorSettings Logo { get; set; } + } + public class EntrySettings + { /// /// The selector that specifies the location of nodes in the object model that /// should be added to the feed. @@ -53,41 +68,42 @@ namespace PolyFeed /// - XML: XPath (e.g. //element_name) /// - JSON: Dotted object (e.g. items.fruit) /// - public string EntrySelector { get; set; } + public string Selector { get; set; } + /// + /// Selector settings to get the URL that an entry should link to. + /// + public SelectorSettings Url { get; set; } = new SelectorSettings() { Attribute = "href" }; + /// /// The title of an entry. /// Selectors may be included in curly braces {} to substitute in content. /// Such selectors are relative to the current feed entry. 
- /// The format varies in the samem way as does. + /// The format varies in the same way as does. /// - public string EntryTitle { get; set; } + public string Title { get; set; } /// - /// Same as , but for the body of an entry. HTML is allowed. + /// Same as , but for the body of an entry. HTML is allowed. /// - public string EntryContent { get; set; } + public string Content { get; set; } /// - /// The selector for the node that contains the date published for an entry. + /// The selector for the date published for an entry. /// - public string EntryPublishedSelector { get; set; } + public SelectorSettings Published { get; set; } /// - /// The name of the attribute that contains the date published for an entry. - /// Set to to use the content of the node itself. + /// The selector for the date last updated for an entry. /// - public string EntryPublishedAttribute { get; set; } + public SelectorSettings LastUpdated { get; set; } - /// - /// Same as , but for the last updated. - /// If not specified, the last updated will be omitted. - /// - public string EntryLastUpdatedSelector { get; set; } - /// - /// Same as . 
- /// - public string EntryLastUpdatedAttribute { get; set; } - - #endregion + public SelectorSettings AuthorName { get; set; } + public SelectorSettings AuthorUrl { get; set; } } + + public class FeedSource + { + public FeedSettings Feed { get; set; } + public EntrySettings Entries { get; set; } + } } diff --git a/PolyFeed/Helpers/HtmlHelpers.cs b/PolyFeed/Helpers/HtmlHelpers.cs new file mode 100644 index 0000000..6ffc4d3 --- /dev/null +++ b/PolyFeed/Helpers/HtmlHelpers.cs @@ -0,0 +1,34 @@ +using System; +using Fizzler.Systems.HtmlAgilityPack; +using HtmlAgilityPack; + +namespace PolyFeed.Helpers +{ + public static class HtmlHelpers + { + public static string QuerySelectorAttributeOrText(this HtmlNode htmlNode, SelectorSettings settings) + { + HtmlNode selectedNode = htmlNode.QuerySelector(settings.Selector); + + if (selectedNode == null) + throw new ApplicationException($"Error: Selector {settings.Selector} failed to find any elements."); + + if (string.IsNullOrWhiteSpace(settings.Attribute)) + return selectedNode.InnerText; + + return selectedNode.Attributes[settings.Attribute].Value; + } + public static string QuerySelectorAttributeOrHtml(this HtmlNode htmlNode, SelectorSettings settings) + { + HtmlNode selectedNode = htmlNode.QuerySelector(settings.Selector); + + if (selectedNode == null) + throw new ApplicationException($"Error: Selector {settings.Selector} failed to find any elements."); + + if (string.IsNullOrWhiteSpace(settings.Attribute)) + return selectedNode.InnerHtml; + + return selectedNode.Attributes[settings.Attribute].Value; + } + } +} diff --git a/PolyFeed/PolyFeed.csproj b/PolyFeed/PolyFeed.csproj index 3fdd947..7500696 100644 --- a/PolyFeed/PolyFeed.csproj +++ b/PolyFeed/PolyFeed.csproj @@ -147,12 +147,15 @@ + + + diff --git a/PolyFeed/Program.cs b/PolyFeed/Program.cs index 41635c4..a7777cb 100644 --- a/PolyFeed/Program.cs +++ b/PolyFeed/Program.cs @@ -13,7 +13,7 @@ namespace PolyFeed public readonly string ProgramName = "PolyFeed"; public 
readonly string Description = "creates Atom feeds from websites that don't support it"; - public string ConfigFilepath = "feed.toml"; + public string ConfigFilepath = null; public string OutputFilepath = "feed.atom"; } @@ -38,20 +38,7 @@ namespace PolyFeed { case "-h": case "--help": - Console.WriteLine($"{settings.ProgramName}, {GetProgramVersion()}"); - Console.WriteLine(" By Starbeamrainbowlabs"); - - Console.WriteLine(); - Console.WriteLine($"This program {settings.Description}."); - Console.WriteLine(); - Console.WriteLine("Usage:"); - Console.WriteLine($" ./{Path.GetFileName(Assembly.GetExecutingAssembly().Location)} [arguments]"); - Console.WriteLine(); - Console.WriteLine("Options:"); - Console.WriteLine(" -h --help Displays this message"); - Console.WriteLine(" -v --version Outputs the version number of this program"); - Console.WriteLine(" -c --config Specifies the location of the feed configuration file to use to generate a feed (default: feed.toml)"); - Console.WriteLine(" -o --output Specifies the location to write the output feed to (default: feed.atom)"); + showHelp(); return 0; case "-v": @@ -71,37 +58,64 @@ namespace PolyFeed } } + if (settings.ConfigFilepath == null) { + Console.Error.WriteLine("Error: No configuration filepath detected. 
Try " + + "using --help to show usage information."); + return 1; + } + ///// 2: Acquire environment variables ///// ///// 3: Run program ///// - - - return 0; + return run().Result; } - private static async Task run() + private static void showHelp() { - FeedSource feedSource = new FeedSource(); - TomlTable config = Toml.ReadFile(settings.ConfigFilepath, TomlSettings.Create()); + Console.WriteLine($"{settings.ProgramName}, {GetProgramVersion()}"); + Console.WriteLine(" By Starbeamrainbowlabs"); - foreach (KeyValuePair item in config) { - string key = Regex.Replace( - item.Key, - @"(^|_)[A-Za-z0-9]", - (match) => match.Value.Replace("_", "").ToUpper() - ); - string value = item.Value.Get().Value; - feedSource.GetType().GetProperty(value).SetValue( - feedSource, - value - ); + Console.WriteLine(); + Console.WriteLine($"This program {settings.Description}."); + Console.WriteLine(); + Console.WriteLine("Usage:"); + Console.WriteLine($" ./{Path.GetFileName(Assembly.GetExecutingAssembly().Location)} [arguments]"); + Console.WriteLine(); + Console.WriteLine("Options:"); + Console.WriteLine(" -h --help Displays this message"); + Console.WriteLine(" -v --version Outputs the version number of this program"); + Console.WriteLine(" -c --config Specifies the location of the TOML feed configuration file to use to generate a feed"); + Console.WriteLine(" -o --output Specifies the location to write the output feed to (default: feed.atom)"); + } + + private static async Task run() + { + TomlSettings parseSettings = TomlSettings.Create(s => + s.ConfigurePropertyMapping(m => m.UseTargetPropertySelector(new SnakeCasePropertySelector())) + ); + FeedSource feedSource = Toml.ReadFile(settings.ConfigFilepath, parseSettings); + + if (feedSource == null) { + Console.Error.WriteLine("Error: Somethine went wrong when parsing your settings file :-("); + return 1; } + if (!string.IsNullOrWhiteSpace(feedSource.Feed.Output)) + settings.OutputFilepath = feedSource.Feed.Output; + FeedBuilder 
feedBuilder = new FeedBuilder(); - await feedBuilder.AddSource(feedSource); - return await feedBuilder.Render(); + try { + await feedBuilder.AddSource(feedSource); + } catch (ApplicationException error) { + Console.Error.WriteLine(error.Message); + return 2; + } + await Console.Error.WriteLineAsync($"[Output] Writing feed to {settings.OutputFilepath}"); + File.WriteAllText(settings.OutputFilepath, await feedBuilder.Render()); + + return 0; } diff --git a/PolyFeed/ReferenceSubstitutor.cs b/PolyFeed/ReferenceSubstitutor.cs index e6d22b9..f410519 100644 --- a/PolyFeed/ReferenceSubstitutor.cs +++ b/PolyFeed/ReferenceSubstitutor.cs @@ -15,10 +15,14 @@ namespace PolyFeed SubstitutionLexer lexer = lexerPool.AcquireLexer(); lexer.Initialise(inputString); + bool useHtml = true; + foreach (LexerToken nextToken in lexer.TokenStream()) { switch (nextToken.Type) { case SubstitutionToken.BraceOpen: + useHtml = nextToken.Value.Length == 1; + lexer.SaveRuleStates(); lexer.EnableRule(SubstitutionToken.Identifier); lexer.DisableRule(SubstitutionToken.Text); @@ -32,7 +36,12 @@ namespace PolyFeed break; case SubstitutionToken.Identifier: - result.Append(rootElement.QuerySelector(nextToken.Value)); + HtmlNode targetNode = rootElement.QuerySelector(nextToken.Value); + if (targetNode == null) { + Console.Error.WriteLine($"Warning: Selector {nextToken.Value} failed to match any elements"); + break; + } + result.Append(useHtml ? 
targetNode.InnerHtml : targetNode.InnerText); break; } } diff --git a/PolyFeed/SnakeCasePropertySelector.cs b/PolyFeed/SnakeCasePropertySelector.cs new file mode 100644 index 0000000..60128d1 --- /dev/null +++ b/PolyFeed/SnakeCasePropertySelector.cs @@ -0,0 +1,27 @@ +using System; +using System.Reflection; +using System.Text.RegularExpressions; +using Nett; + +namespace PolyFeed +{ + public class SnakeCasePropertySelector : ITargetPropertySelector + { + public SnakeCasePropertySelector() + { + } + + public PropertyInfo TryGetTargetProperty(string key, Type target) + { + string transformedKey = Regex.Replace( + key, + @"(^|_)[A-Za-z0-9]", + (match) => match.Value.Replace("_", "").ToUpper() + ); + + //Console.WriteLine($"{key} -> {transformedKey}"); + + return target.GetProperty(transformedKey); + } + } +} diff --git a/examples/twitter.toml b/examples/twitter.toml new file mode 100644 index 0000000..95fc8fd --- /dev/null +++ b/examples/twitter.toml @@ -0,0 +1,23 @@ +[feed] +output = "CrossCodeLea-Twitter.atom" + +url = "https://mobile.twitter.com/CrossCodeLea" + +source_type = "html" + +title = "{{.username}} on Twitter" +subtitle = "{{.details}}" + +logo = { selector = ".avatar img", attribute = "src" } + +[entries] +selector = ".tweet" +title = "Tweet by {.username}" +content = "{.tweet-social-context}
{.avatar}{.fullname}
{.tweet-content}" + +url = { selector = ".metadata a", attribute = "href" } + +author_name = { selector = ".username" } + +# published = { selector = "", attribute = "" } +# last_updated = { selector = "", attribute = "" }