1
0
Fork 0
mirror of https://github.com/sbrl/PolyFeed.git synced 2024-09-27 22:35:59 +00:00

Compare commits

..

7 commits

11 changed files with 347 additions and 109 deletions

1
.gitignore vendored
View file

@ -1,3 +1,4 @@
*.atom
# Created by https://www.gitignore.io/api/visualstudio,monodevelop,csharp
# Edit at https://www.gitignore.io/?templates=visualstudio,monodevelop,csharp

View file

@ -9,28 +9,35 @@ using Fizzler.Systems.HtmlAgilityPack;
using HtmlAgilityPack;
using Microsoft.SyndicationFeed;
using Microsoft.SyndicationFeed.Atom;
using PolyFeed.Helpers;
namespace PolyFeed
{
public class FeedBuilder
{
StringBuilder result = new StringBuilder();
MemoryStream stream = new MemoryStream();
XmlWriter xml = null;
AtomFeedWriter feed = null;
public FeedBuilder() {
xml = XmlWriter.Create(result);
feed = new AtomFeedWriter(xml);
xml = XmlWriter.Create(stream, new XmlWriterSettings() {
Indent = true,
Encoding = new UTF8Encoding(false),
WriteEndDocumentOnClose = true
});
feed = new AtomFeedWriter(xml, null, new AtomFormatter() { UseCDATA = true });
}
public async Task AddSource(FeedSource source) {
WebResponse response = await WebRequest.Create(source.Url).GetResponseAsync();
await Console.Error.WriteLineAsync("[Builder] Downloading content");
WebResponse response = await WebRequest.Create(source.Feed.Url).GetResponseAsync();
await Console.Error.WriteLineAsync("[Builder] Generating feed header");
// Write the header
await feed.WriteGenerator("Polyfeed", "https://gitlab.com/sbrl/PolyFeed.git", Program.GetProgramVersion());
await feed.WriteId(source.Url);
await feed.WriteGenerator("Polyfeed", "https://github.com/sbrl/PolyFeed.git", Program.GetProgramVersion());
await feed.WriteId(source.Feed.Url);
await feed.Write(new SyndicationLink(new Uri(source.Feed.Url), AtomLinkTypes.Self));
string lastModified = response.Headers.Get("last-modified");
if (string.IsNullOrWhiteSpace(lastModified))
await feed.WriteUpdated(DateTimeOffset.Now);
@ -39,64 +46,99 @@ namespace PolyFeed
string contentType = response.Headers.Get("content-type");
switch (source.SourceType) {
switch (source.Feed.Type) {
case SourceType.HTML:
await AddSourceHtml(source, response);
break;
default:
throw new NotImplementedException($"Error: The source type {source.SourceType} hasn't been implemented yet.");
throw new NotImplementedException($"Error: The source type {source.Feed.Type} hasn't been implemented yet.");
}
await Console.Error.WriteLineAsync("[Builder] Done!");
}
private async Task AddSourceHtml(FeedSource source, WebResponse response) {
await Console.Error.WriteLineAsync("[Builder/Html] Parsing Html");
// Parse the HTML
HtmlDocument html = new HtmlDocument();
using (StreamReader reader = new StreamReader(response.GetResponseStream()))
html.LoadHtml(await reader.ReadToEndAsync());
HtmlNode document = html.DocumentNode;
await feed.WriteTitle(ReferenceSubstitutor.Replace(source.Title, document));
await feed.WriteSubtitle(ReferenceSubstitutor.Replace(source.Subtitle, document));
foreach (HtmlNode nextNode in document.QuerySelectorAll(source.EntrySelector)) {
HtmlNode urlNode = nextNode.QuerySelector(source.EntryUrlSelector);
string url = source.EntryUrlAttribute == string.Empty ?
urlNode.InnerText : urlNode.Attributes[source.EntryUrlAttribute].DeEntitizeValue;
document.AbsolutifyUris(new Uri(source.Feed.Url));
SyndicationItem nextItem = new SyndicationItem() {
Id = url,
Title = ReferenceSubstitutor.Replace(source.EntryTitle, nextNode),
Description = ReferenceSubstitutor.Replace(source.EntryContent, nextNode)
await Console.Error.WriteLineAsync("[Builder/Html] Generating feed content");
// Add the title
await feed.WriteTitle(ReferenceSubstitutor.Replace(source.Feed.Title, document));
await feed.WriteSubtitle(ReferenceSubstitutor.Replace(source.Feed.Subtitle, document));
// Add the logo
if (source.Feed.Logo != null) {
HtmlNode logoNode = document.QuerySelector(source.Feed.Logo.Selector);
xml.WriteElementString("logo", logoNode.Attributes[source.Feed.Logo.Attribute].Value);
}
// Add the feed entries
foreach (HtmlNode nextNode in document.QuerySelectorAll(source.Entries.Selector)) {
HtmlNode urlNode = nextNode.QuerySelector(source.Entries.Url.Selector);
if (urlNode == null)
throw new ApplicationException("Error: Failed to match entry url selector against an element.");
string url = source.Entries.Url.Attribute == string.Empty ?
urlNode.InnerText : urlNode.Attributes[source.Entries.Url.Attribute].DeEntitizeValue;
Uri entryUri = new Uri(new Uri(source.Feed.Url), new Uri(url));
AtomEntry nextItem = new AtomEntry() {
Id = entryUri.ToString(),
Title = ReferenceSubstitutor.Replace(source.Entries.Title, nextNode),
Description = ReferenceSubstitutor.Replace(source.Entries.Content, nextNode),
ContentType = "html"
};
nextItem.AddLink(new SyndicationLink(entryUri, AtomLinkTypes.Alternate));
if (source.EntryPublishedSelector != string.Empty) {
HtmlNode publishedNode = nextNode.QuerySelector(source.EntryPublishedSelector);
if (source.Entries.Published != null) {
nextItem.Published = DateTime.Parse(
source.EntryPublishedAttribute == string.Empty
? publishedNode.InnerText
: publishedNode.Attributes[source.EntryPublishedAttribute].DeEntitizeValue
nextNode.QuerySelectorAttributeOrText(
source.Entries.Published
)
);
}
if (source.EntryPublishedSelector != string.Empty) {
HtmlNode lastUpdatedNode = nextNode.QuerySelector(source.EntryLastUpdatedSelector);
nextItem.Published = DateTime.Parse(
source.EntryLastUpdatedAttribute == string.Empty
? lastUpdatedNode.InnerText
: lastUpdatedNode.Attributes[source.EntryLastUpdatedAttribute].DeEntitizeValue
// Use the configured last-updated selector when there is one. The guard
// must test LastUpdated (not Published): it is the LastUpdated settings
// object that gets dereferenced below.
if (source.Entries.LastUpdated != null) {
	nextItem.LastUpdated = DateTime.Parse(
		nextNode.QuerySelectorAttributeOrText(
			source.Entries.LastUpdated
		)
	);
}
else // It requires one, apparently
	nextItem.LastUpdated = DateTimeOffset.Now;
SyndicationPerson author = new SyndicationPerson(
nextNode.QuerySelectorAttributeOrText(source.Entries.AuthorName).Trim(),
""
);
if(source.Entries.AuthorUrl != null)
author.Uri = nextNode.QuerySelectorAttributeOrText(source.Entries.AuthorUrl);
nextItem.AddContributor(author);
await feed.Write(nextItem);
}
}
public string Render()
public async Task<string> Render()
{
xml.Flush();
await feed.Flush();
xml.WriteEndDocument();
return result.ToString();
xml.Flush();
xml.Close();
return Encoding.UTF8.GetString(stream.ToArray());
}
}
}

View file

@ -4,8 +4,29 @@ namespace PolyFeed
{
public enum SourceType { HTML, XML, JSON };
public class FeedSource
/// <summary>
/// Describes where to find a single value in a source document: which element
/// to select, and optionally which of its attributes to read.
/// </summary>
public class SelectorSettings
{
	/// <summary>
	/// The selector used to pick out the element of interest.
	/// </summary>
	public string Selector { get; set; }

	/// <summary>
	/// The attribute on the selected element whose value should be used.
	/// Leave as an empty string to use the content of the element itself
	/// rather than the content of one of its attributes.
	/// </summary>
	public string Attribute { get; set; }

	/// <summary>
	/// Renders both settings in a compact form suitable for debugging output.
	/// </summary>
	/// <returns>A string containing the selector and the attribute name.</returns>
	public override string ToString() =>
		$"[SelectorSettings Selector = {Selector}, Attribute = {Attribute}]";
}
public class FeedSettings
{
public string Output { get; set; }
/// <summary>
/// The url of the source document to parse.
/// </summary>
@ -15,7 +36,9 @@ namespace PolyFeed
/// <summary>
/// The type of source document to expect.
/// </summary>
public SourceType SourceType { get; set; }
public string SourceType { get; set; }
public SourceType Type => (SourceType)Enum.Parse(typeof(SourceType), SourceType, true);
/// <summary>
/// The title of the feed.
@ -29,22 +52,14 @@ namespace PolyFeed
/// <value>The subtitle.</value>
public string Subtitle { get; set; }
#region Entries
/// <summary>
/// A selector that matches against an element that contains the URL that an
/// entry should link to.
/// Relative to the element selected by <see cref="EntrySelector" />.
/// Selector that matches against the feed logo url.
/// </summary>
public string EntryUrlSelector { get; set; }
/// <summary>
/// The name of the attribute on the element selected by <see cref="EntryUrlSelector" />.
/// Set to an empty string to select the content of the element instead of the
/// content of an attribute.
/// </summary>
public string EntryUrlAttribute { get; set; } = "";
public SelectorSettings Logo { get; set; }
}
public class EntrySettings
{
/// <summary>
/// The selector that specifies the location of nodes in the object model that
/// should be added to the feed.
@ -53,41 +68,42 @@ namespace PolyFeed
/// - XML: XPath (e.g. //element_name)
/// - JSON: Dotted object (e.g. items.fruit)
/// </summary>
public string EntrySelector { get; set; }
public string Selector { get; set; }
/// <summary>
/// Selector settings to get the URL that an entry should link to.
/// </summary>
public SelectorSettings Url { get; set; } = new SelectorSettings() { Attribute = "href" };
/// <summary>
/// The title of an entry.
/// Selectors may be included in curly braces {} to substitute in content.
/// Such selectors are relative to the current feed entry.
/// The format varies in the samem way as <see cref="EntrySelector" /> does.
/// The format varies in the same way as <see cref="Selector" /> does.
/// </summary>
public string EntryTitle { get; set; }
public string Title { get; set; }
/// <summary>
/// Same as <see cref="EntryTitle" />, but for the body of an entry. HTML is allowed.
/// Same as <see cref="Title" />, but for the body of an entry. HTML is allowed.
/// </summary>
public string EntryContent { get; set; }
public string Content { get; set; }
/// <summary>
/// The selector for the node that contains the date published for an entry.
/// The selector for the date published for an entry.
/// </summary>
public string EntryPublishedSelector { get; set; }
public SelectorSettings Published { get; set; }
/// <summary>
/// The name of the attribute that contains the date published for an entry.
/// Set to <see cref="string.Empty" /> to use the content of the node itself.
/// The selector for the date an entry was last updated.
/// </summary>
public string EntryPublishedAttribute { get; set; }
public SelectorSettings LastUpdated { get; set; }
/// <summary>
/// Same as <see cref="EntryPublishedSelector" />, but for the last updated.
/// If not specified, the last updated will be omitted.
/// </summary>
public string EntryLastUpdatedSelector { get; set; }
/// <summary>
/// Same as <see cref="EntryPublishedAttribute" />.
/// </summary>
public string EntryLastUpdatedAttribute { get; set; }
#endregion
public SelectorSettings AuthorName { get; set; }
public SelectorSettings AuthorUrl { get; set; }
}
/// <summary>
/// The complete set of settings describing one feed to generate.
/// Holds the feed-level settings and the per-entry settings as two separate groups.
/// </summary>
public class FeedSource
{
/// <summary>
/// Settings that apply to the feed as a whole (see <see cref="FeedSettings" />).
/// </summary>
public FeedSettings Feed { get; set; }
/// <summary>
/// Settings that describe how to locate entries and extract their
/// details (see <see cref="EntrySettings" />).
/// </summary>
public EntrySettings Entries { get; set; }
}
}

View file

@ -0,0 +1,64 @@
using System;
using System.Threading;
using System.Threading.Tasks;
using Fizzler.Systems.HtmlAgilityPack;
using HtmlAgilityPack;
namespace PolyFeed.Helpers
{
/// <summary>
/// Extension methods for <see cref="HtmlNode" /> that extract values described by
/// <see cref="SelectorSettings" /> and rewrite link targets as absolute URIs.
/// </summary>
public static class HtmlHelpers
{
	/// <summary>
	/// Selects the first element matching the selector in the given settings and
	/// returns either the value of the configured attribute or, when no attribute
	/// is configured, the inner text of the element.
	/// </summary>
	/// <param name="htmlNode">The node to search beneath.</param>
	/// <param name="settings">The selector (and optional attribute name) to use.</param>
	/// <returns>The attribute value, or the inner text of the matched element.</returns>
	/// <exception cref="ArgumentNullException">If either argument is null.</exception>
	/// <exception cref="ApplicationException">
	/// If the selector matches nothing, or the matched element lacks the configured attribute.
	/// </exception>
	public static string QuerySelectorAttributeOrText(this HtmlNode htmlNode, SelectorSettings settings)
	{
		HtmlNode selectedNode = SelectRequiredNode(htmlNode, settings);
		if (string.IsNullOrWhiteSpace(settings.Attribute))
			return selectedNode.InnerText;
		return GetRequiredAttributeValue(selectedNode, settings);
	}

	/// <summary>
	/// Selects the first element matching the selector in the given settings and
	/// returns either the value of the configured attribute or, when no attribute
	/// is configured, the inner HTML of the element.
	/// </summary>
	/// <param name="htmlNode">The node to search beneath.</param>
	/// <param name="settings">The selector (and optional attribute name) to use.</param>
	/// <returns>The attribute value, or the inner HTML of the matched element.</returns>
	/// <exception cref="ArgumentNullException">If either argument is null.</exception>
	/// <exception cref="ApplicationException">
	/// If the selector matches nothing, or the matched element lacks the configured attribute.
	/// </exception>
	public static string QuerySelectorAttributeOrHtml(this HtmlNode htmlNode, SelectorSettings settings)
	{
		HtmlNode selectedNode = SelectRequiredNode(htmlNode, settings);
		if (string.IsNullOrWhiteSpace(settings.Attribute))
			return selectedNode.InnerHtml;
		return GetRequiredAttributeValue(selectedNode, settings);
	}

	/// <summary>
	/// Searches for and converts all the links that are children of the current
	/// <see cref="HtmlNode" /> to absolute URIs.
	/// Only a and img elements are examined. Elements with neither a src nor a
	/// href attribute, and values that cannot be combined with the base URI, are
	/// left untouched.
	/// </summary>
	/// <param name="rootNode">The root node to search from.</param>
	/// <param name="baseUri">The base URI to use for conversion.</param>
	/// <returns>The number of nodes updated.</returns>
	/// <exception cref="ArgumentNullException">If either argument is null.</exception>
	public static int AbsolutifyUris(this HtmlNode rootNode, Uri baseUri)
	{
		if (rootNode == null)
			throw new ArgumentNullException(nameof(rootNode));
		if (baseUri == null)
			throw new ArgumentNullException(nameof(baseUri));

		int nodesUpdated = 0;
		// Deliberately sequential: the nodes all belong to one shared document
		// tree, which is not documented as safe to mutate from several threads.
		foreach (HtmlNode node in rootNode.QuerySelectorAll("a, img"))
		{
			// src takes precedence over href when an element carries both
			HtmlAttribute linkAttribute = node.Attributes["src"] ?? node.Attributes["href"];
			if (linkAttribute == null)
				continue; // e.g. a named anchor - there is nothing to rewrite

			Uri absoluteUri;
			if (!Uri.TryCreate(baseUri, linkAttribute.Value, out absoluteUri))
				continue; // Malformed link - leave it alone instead of aborting the whole feed

			linkAttribute.Value = absoluteUri.ToString();
			nodesUpdated++;
		}
		return nodesUpdated;
	}

	/// <summary>
	/// Runs the selector from the settings against the node, insisting on a match.
	/// </summary>
	private static HtmlNode SelectRequiredNode(HtmlNode htmlNode, SelectorSettings settings)
	{
		if (htmlNode == null)
			throw new ArgumentNullException(nameof(htmlNode));
		if (settings == null)
			throw new ArgumentNullException(nameof(settings));

		HtmlNode selectedNode = htmlNode.QuerySelector(settings.Selector);
		if (selectedNode == null)
			throw new ApplicationException($"Error: Selector {settings.Selector} failed to find any elements.");
		return selectedNode;
	}

	/// <summary>
	/// Reads the configured attribute from an already-selected node, insisting that it exists.
	/// </summary>
	private static string GetRequiredAttributeValue(HtmlNode selectedNode, SelectorSettings settings)
	{
		HtmlAttribute attribute = selectedNode.Attributes[settings.Attribute];
		if (attribute == null)
			throw new ApplicationException($"Error: The element matched by selector {settings.Selector} has no {settings.Attribute} attribute.");
		return attribute.Value;
	}
}
}

View file

@ -147,12 +147,15 @@
<Compile Include="SubstitutionLexer.cs" />
<Compile Include="Salamander.Core\LexerPool.cs" />
<Compile Include="ReferenceSubstitutor.cs" />
<Compile Include="SnakeCasePropertySelector.cs" />
<Compile Include="Helpers\HtmlHelpers.cs" />
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
<ItemGroup>
<Folder Include="Salamander.Core\" />
<Folder Include="Helpers\" />
</ItemGroup>
<Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
<Import Project="..\packages\NETStandard.Library.2.0.3\build\netstandard2.0\NETStandard.Library.targets" Condition="Exists('..\packages\NETStandard.Library.2.0.3\build\netstandard2.0\NETStandard.Library.targets')" />

View file

@ -13,7 +13,7 @@ namespace PolyFeed
public readonly string ProgramName = "PolyFeed";
public readonly string Description = "creates Atom feeds from websites that don't support it";
public string ConfigFilepath = "feed.toml";
public string ConfigFilepath = null;
public string OutputFilepath = "feed.atom";
}
@ -38,20 +38,7 @@ namespace PolyFeed
{
case "-h":
case "--help":
Console.WriteLine($"{settings.ProgramName}, {GetProgramVersion()}");
Console.WriteLine(" By Starbeamrainbowlabs");
Console.WriteLine();
Console.WriteLine($"This program {settings.Description}.");
Console.WriteLine();
Console.WriteLine("Usage:");
Console.WriteLine($" ./{Path.GetFileName(Assembly.GetExecutingAssembly().Location)} [arguments]");
Console.WriteLine();
Console.WriteLine("Options:");
Console.WriteLine(" -h --help Displays this message");
Console.WriteLine(" -v --version Outputs the version number of this program");
Console.WriteLine(" -c --config Specifies the location of the feed configuration file to use to generate a feed (default: feed.toml)");
Console.WriteLine(" -o --output Specifies the location to write the output feed to (default: feed.atom)");
showHelp();
return 0;
case "-v":
@ -71,37 +58,64 @@ namespace PolyFeed
}
}
if (settings.ConfigFilepath == null) {
Console.Error.WriteLine("Error: No configuration filepath detected. Try " +
"using --help to show usage information.");
return 1;
}
///// 2: Acquire environment variables /////
///// 3: Run program /////
return 0;
return run().Result;
}
private static async Task<string> run()
private static void showHelp()
{
FeedSource feedSource = new FeedSource();
TomlTable config = Toml.ReadFile(settings.ConfigFilepath, TomlSettings.Create());
Console.WriteLine($"{settings.ProgramName}, {GetProgramVersion()}");
Console.WriteLine(" By Starbeamrainbowlabs");
foreach (KeyValuePair<string, TomlObject> item in config) {
string key = Regex.Replace(
item.Key,
@"(^|_)[A-Za-z0-9]",
(match) => match.Value.Replace("_", "").ToUpper()
);
string value = item.Value.Get<TomlString>().Value;
feedSource.GetType().GetProperty(value).SetValue(
feedSource,
value
);
Console.WriteLine();
Console.WriteLine($"This program {settings.Description}.");
Console.WriteLine();
Console.WriteLine("Usage:");
Console.WriteLine($" ./{Path.GetFileName(Assembly.GetExecutingAssembly().Location)} [arguments]");
Console.WriteLine();
Console.WriteLine("Options:");
Console.WriteLine(" -h --help Displays this message");
Console.WriteLine(" -v --version Outputs the version number of this program");
Console.WriteLine(" -c --config Specifies the location of the TOML feed configuration file to use to generate a feed");
Console.WriteLine(" -o --output Specifies the location to write the output feed to (default: feed.atom)");
}
private static async Task<int> run()
{
TomlSettings parseSettings = TomlSettings.Create(s =>
s.ConfigurePropertyMapping(m => m.UseTargetPropertySelector(new SnakeCasePropertySelector()))
);
FeedSource feedSource = Toml.ReadFile<FeedSource>(settings.ConfigFilepath, parseSettings);
// Bail out with a non-zero exit code if the configuration file could not
// be turned into a FeedSource at all.
if (feedSource == null) {
	Console.Error.WriteLine("Error: Something went wrong when parsing your settings file :-(");
	return 1;
}
if (!string.IsNullOrWhiteSpace(feedSource.Feed.Output))
settings.OutputFilepath = feedSource.Feed.Output;
FeedBuilder feedBuilder = new FeedBuilder();
try {
await feedBuilder.AddSource(feedSource);
return await feedBuilder.Render();
} catch (ApplicationException error) {
Console.Error.WriteLine(error.Message);
return 2;
}
await Console.Error.WriteLineAsync($"[Output] Writing feed to {settings.OutputFilepath}");
File.WriteAllText(settings.OutputFilepath, await feedBuilder.Render());
return 0;
}

View file

@ -15,10 +15,14 @@ namespace PolyFeed
SubstitutionLexer lexer = lexerPool.AcquireLexer();
lexer.Initialise(inputString);
bool useHtml = true;
foreach (LexerToken<SubstitutionToken> nextToken in lexer.TokenStream())
{
switch (nextToken.Type) {
case SubstitutionToken.BraceOpen:
useHtml = nextToken.Value.Length == 1;
lexer.SaveRuleStates();
lexer.EnableRule(SubstitutionToken.Identifier);
lexer.DisableRule(SubstitutionToken.Text);
@ -32,7 +36,12 @@ namespace PolyFeed
break;
case SubstitutionToken.Identifier:
result.Append(rootElement.QuerySelector(nextToken.Value));
HtmlNode targetNode = rootElement.QuerySelector(nextToken.Value);
if (targetNode == null) {
Console.Error.WriteLine($"Warning: Selector {nextToken.Value} failed to match any elements");
break;
}
result.Append(useHtml ? targetNode.InnerHtml : targetNode.InnerText);
break;
}
}

View file

@ -0,0 +1,27 @@
using System;
using System.Reflection;
using System.Text.RegularExpressions;
using Nett;
namespace PolyFeed
{
/// <summary>
/// Maps snake_case configuration keys onto PascalCase property names, so that a
/// key such as <c>source_type</c> is matched with a property called <c>SourceType</c>.
/// </summary>
public class SnakeCasePropertySelector : ITargetPropertySelector
{
	/// <summary>
	/// Matches the first character of the key and every character that directly
	/// follows an underscore (together with that underscore).
	/// Built once, since the pattern never changes between calls.
	/// </summary>
	private static readonly Regex wordStartPattern = new Regex(@"(^|_)[A-Za-z0-9]");

	public SnakeCasePropertySelector()
	{
	}

	/// <summary>
	/// Looks up the property on the target type that corresponds to a snake_case key.
	/// </summary>
	/// <param name="key">The snake_case key to convert.</param>
	/// <param name="target">The type to search for a matching public property.</param>
	/// <returns>The matching property, or null if the target has no property with the converted name.</returns>
	public PropertyInfo TryGetTargetProperty(string key, Type target)
	{
		// Drop each underscore and upper-case the character after it. The
		// invariant culture is used so that the result does not depend on the
		// locale of the machine (e.g. "i" must always become "I").
		string transformedKey = wordStartPattern.Replace(
			key,
			(match) => match.Value.Replace("_", "").ToUpperInvariant()
		);
		return target.GetProperty(transformedKey);
	}
}
}

View file

@ -24,8 +24,8 @@ namespace PolyFeed
AddRules(new List<LexerRule<SubstitutionToken>>() {
new LexerRule<SubstitutionToken>(SubstitutionToken.Text, @"[^{}]+"),
new LexerRule<SubstitutionToken>(SubstitutionToken.Identifier, @"[^{}]+"),
new LexerRule<SubstitutionToken>(SubstitutionToken.BraceOpen, @"\{"),
new LexerRule<SubstitutionToken>(SubstitutionToken.BraceClose, @"\}"),
new LexerRule<SubstitutionToken>(SubstitutionToken.BraceOpen, @"\{+"),
new LexerRule<SubstitutionToken>(SubstitutionToken.BraceClose, @"\}+"),
});
}

View file

@ -2,4 +2,43 @@
> Create Atom feeds for websites that don't support it
Currently in alpha.
PolyFeed generates Atom feeds out of websites that don't have one, such as _Twitter_ or _Facebook_ (* cough * * cough *). It runs on any platform that can run C&sharp; .NET applications, including Linux and Windows.
## Install
### From a Release
Download and extract the [latest release](https://github.com/sbrl/PolyFeed/releases/latest). You're done!
### Building from Source
Clone this repository, and then build the code with `msbuild`:
```bash
msbuild /p:Configuration=Release
```
The build output will be written to `PolyFeed/bin/Release`.
## Usage
PolyFeed uses [TOML](https://github.com/toml-lang/toml) configuration files to define Atom feeds. First, create a configuration file that specifies how PolyFeed should generate an Atom feed - or use [one of the examples](https://github.com/sbrl/PolyFeed/tree/master/examples).
Then, run PolyFeed over it:
```bash
path/to/PolyFeed.exe --config path/to/config.toml
```
...it will generate the named `.atom` file automatically, keeping you up-to-date on its progress and any errors it encounters.
Use `PolyFeed.exe --help` to display the full range of command-line flags available.
## Contributing
Contributions are welcome - feel free to [open an issue](https://github.com/sbrl/PolyFeed/issues/new) or (even better) a [pull request](https://github.com/sbrl/PolyFeed/compare).
The [issue tracker](https://github.com/sbrl/PolyFeed/issues) is the place where all the tasks relating to the project are kept.
## Licence
PolyFeed is released under the _Mozilla Public License 2.0_. The full license text is included in the `LICENSE` file in this repository.

23
examples/twitter.toml Normal file
View file

@ -0,0 +1,23 @@
# Example PolyFeed configuration that builds an Atom feed from a Twitter profile page.
# Keys are written in snake_case and are matched to the PascalCase setting of
# the same name (e.g. source_type -> SourceType).
# In title / subtitle / content, a selector in {{double braces}} inserts the
# text of the matched element, while {single braces} inserts its HTML.

[feed]
output = "euruicimages-Twitter.atom"
url = "https://mobile.twitter.com/euruicimages"
source_type = "html"
title = "{{.username}} on Twitter"
subtitle = "{{.details}}"
# The setting is called "logo" - it holds the selector for the logo url
logo = { selector = ".avatar img", attribute = "src" }

[entries]
selector = ".tweet"
title = "Tweet by {{.username}} {{.tweet-social-context}}"
content = "<p><strong>{.avatar}{.fullname}:</strong></p>\n{.tweet-text}"
url = { selector = ".metadata a", attribute = "href" }
author_name = { selector = ".username" }
# published = { selector = "", attribute = "" }
# last_updated = { selector = "", attribute = "" }