Skip to content
Merged
Show file tree
Hide file tree
Changes from 137 commits
Commits
Show all changes
142 commits
Select commit Hold shift + click to select a range
e314528
Add ContentExtractorStrategy
elzik May 25, 2025
9001db8
Add additional code coverage
elzik May 25, 2025
540d466
Use ContentExtractorStrategy with only default extractor
elzik May 25, 2025
faa20b6
Give default content extractor a better name
elzik May 25, 2025
23f466a
Move content extractors to their own namespace
elzik May 25, 2025
258b8b5
Supress warnings needed for tests
elzik May 25, 2025
8d3de23
Fix test not using cirrect instance
elzik May 25, 2025
0c1e8ec
Rename tests & adjust namespaces to match class being tested
elzik May 25, 2025
cadae36
Remove repeated test
elzik May 25, 2025
3c2eef3
Add CanHandle tests
elzik May 25, 2025
f4b3630
Add partial SubRedditContentExtractor
elzik May 25, 2025
41665fb
Rename WebPageDownloader - it downloads any text not just web pages
elzik May 25, 2025
5675b09
Rename test files to match class names
elzik May 26, 2025
7f043d4
Add TryGet to HttpDownloader
elzik May 26, 2025
52dc3f8
Complete SubRedditContentExtractor.ExtractAsync implementation
elzik May 26, 2025
f0d9bf7
Ignore local test playlists
elzik May 26, 2025
464b781
Refine SubRedditExtractorTests
elzik May 26, 2025
b887a80
Ensure sub-reddit URLs are genrated regardelss of whether they have a…
elzik May 26, 2025
cb48db3
Log strategy used
elzik May 26, 2025
789952d
Make SubRedditContentExtractor available
elzik May 26, 2025
6f86a1b
Move Reddit concerns to its own namespace and fix image extraction
elzik May 26, 2025
c46341b
Use Shouldly for asserts
elzik May 26, 2025
96fd565
Add client for posts new in a Subreddit
elzik May 26, 2025
2353cd2
Code quality fixes
elzik May 26, 2025
5cc945e
Add abour subreddit to reddit client
elzik May 28, 2025
90241c3
Initial reddit posts client
elzik May 28, 2025
c7af966
Skip reddit tests which cannot run in CI
elzik May 28, 2025
5efc53b
Ensure all reddit-based tests are skipped
elzik May 29, 2025
c3bc746
Refine RedditPostClient and assert main post os correct
elzik Jun 10, 2025
2bf045c
Increae RedditPostClient test coverage
elzik Jun 28, 2025
4080d57
Code quality fixes
elzik Jun 28, 2025
ea61877
Upgrade Sonar & fix code quality issues
elzik Jun 28, 2025
8355024
Code quality fixes
elzik Jul 4, 2025
fe47956
Simply array indexing
elzik Jul 6, 2025
77e1936
Make converter redditspecific by writing in the same format as reddit…
elzik Jul 6, 2025
c0e0f44
Add RedditDateTimeConverter tests
elzik Jul 6, 2025
379e8f1
Code quality fixes
elzik Jul 6, 2025
729ab1a
Initial tests for RedditRepliesConverter
elzik Jul 11, 2025
a4b8a4e
Fix failing tests
elzik Jul 11, 2025
7a86789
Merge branch 'main' into summarise-sub-reddit
elzik Sep 18, 2025
e95a89a
Merge branch 'main' into summarise-sub-reddit
elzik Sep 19, 2025
e0d777a
Make test less brittle and account for possible post deletion in the …
elzik Sep 20, 2025
82114ec
Add simplified version of reddit post and client
elzik Sep 21, 2025
b65ae9a
Move reply converstion into transformer
elzik Sep 21, 2025
d6876f4
Return Tasks for async methods
elzik Sep 21, 2025
c1d632b
Add NSubstitute analysers
elzik Sep 21, 2025
3041b77
Don’t await NSubstitute Received verification
elzik Sep 21, 2025
fb396b4
Fix culture-unsafe formatting and unnecessary WriteRawValue
elzik Sep 21, 2025
abbfe23
Avoid local-time skew when value.Kind is Unspecified
elzik Sep 21, 2025
10b02b8
Add explicit using for Reddit.Client and remove the redundant self-na…
elzik Sep 21, 2025
6df7ced
Fix locale‑dependent JSON construction for doubles
elzik Sep 21, 2025
02822ea
Merge branch 'summarise-sub-reddit' of https://github.com/elzik/breef…
elzik Sep 21, 2025
68f60e8
Ensure that tests fail is the wrong extractor is used
elzik Sep 24, 2025
d7dd68e
Add guard against Children being null
elzik Sep 24, 2025
321a49f
Avoid using ThrowsAsync for throwing exeptions from mocks
elzik Sep 24, 2025
d89594d
Add a null-guard for rawRedditPost
elzik Sep 25, 2025
d734a12
Add ContentExtractorStrategy
elzik May 25, 2025
03eef6c
Add additional code coverage
elzik May 25, 2025
02cdb7d
Use ContentExtractorStrategy with only default extractor
elzik May 25, 2025
a923a1f
Give default content extractor a better name
elzik May 25, 2025
f40cef2
Move content extractors to their own namespace
elzik May 25, 2025
fec3b83
Supress warnings needed for tests
elzik May 25, 2025
a355743
Fix test not using cirrect instance
elzik May 25, 2025
45f06a4
Rename tests & adjust namespaces to match class being tested
elzik May 25, 2025
380b386
Remove repeated test
elzik May 25, 2025
fe67ad0
Add CanHandle tests
elzik May 25, 2025
0dd1fa2
Add partial SubRedditContentExtractor
elzik May 25, 2025
5a61dd0
Rename WebPageDownloader - it downloads any text not just web pages
elzik May 25, 2025
0109efc
Rename test files to match class names
elzik May 26, 2025
c39d84f
Add TryGet to HttpDownloader
elzik May 26, 2025
52578e7
Complete SubRedditContentExtractor.ExtractAsync implementation
elzik May 26, 2025
00acc98
Ignore local test playlists
elzik May 26, 2025
cb60afa
Refine SubRedditExtractorTests
elzik May 26, 2025
18c49e7
Ensure sub-reddit URLs are genrated regardelss of whether they have a…
elzik May 26, 2025
3bfd2c9
Log strategy used
elzik May 26, 2025
6d8813e
Make SubRedditContentExtractor available
elzik May 26, 2025
a90404b
Move Reddit concerns to its own namespace and fix image extraction
elzik May 26, 2025
d1329ac
Use Shouldly for asserts
elzik May 26, 2025
261b65a
Add client for posts new in a Subreddit
elzik May 26, 2025
a8b53fd
Code quality fixes
elzik May 26, 2025
c6c1890
Add abour subreddit to reddit client
elzik May 28, 2025
85f4555
Initial reddit posts client
elzik May 28, 2025
f5ca510
Skip reddit tests which cannot run in CI
elzik May 28, 2025
5bb90ad
Ensure all reddit-based tests are skipped
elzik May 29, 2025
db08ecf
Refine RedditPostClient and assert main post os correct
elzik Jun 10, 2025
79211f5
Increae RedditPostClient test coverage
elzik Jun 28, 2025
6387832
Code quality fixes
elzik Jun 28, 2025
e307205
Upgrade Sonar & fix code quality issues
elzik Jun 28, 2025
f068c1a
Code quality fixes
elzik Jul 4, 2025
4e234cc
Simply array indexing
elzik Jul 6, 2025
2051b29
Make converter redditspecific by writing in the same format as reddit…
elzik Jul 6, 2025
58d1f48
Add RedditDateTimeConverter tests
elzik Jul 6, 2025
4cb938f
Code quality fixes
elzik Jul 6, 2025
dbd62d8
Initial tests for RedditRepliesConverter
elzik Jul 11, 2025
5a600d7
Fix failing tests
elzik Jul 11, 2025
d346935
Make test less brittle and account for possible post deletion in the …
elzik Sep 20, 2025
5e8dd64
Add simplified version of reddit post and client
elzik Sep 21, 2025
27c551b
Move reply converstion into transformer
elzik Sep 21, 2025
b7c4819
Return Tasks for async methods
elzik Sep 21, 2025
6f83d35
Add NSubstitute analysers
elzik Sep 21, 2025
181d345
Don’t await NSubstitute Received verification
elzik Sep 21, 2025
e88e44c
Fix culture-unsafe formatting and unnecessary WriteRawValue
elzik Sep 21, 2025
8eba336
Avoid local-time skew when value.Kind is Unspecified
elzik Sep 21, 2025
ea5fb57
Fix locale‑dependent JSON construction for doubles
elzik Sep 21, 2025
999fc80
Add explicit using for Reddit.Client and remove the redundant self-na…
elzik Sep 21, 2025
767b2d9
Ensure that tests fail is the wrong extractor is used
elzik Sep 24, 2025
c97ea9b
Add guard against Children being null
elzik Sep 24, 2025
1046d47
Avoid using ThrowsAsync for throwing exeptions from mocks
elzik Sep 24, 2025
fc76c52
Add a null-guard for rawRedditPost
elzik Sep 25, 2025
268b76d
Merge branch 'summarise-sub-reddit' of https://github.com/elzik/breef…
elzik Oct 1, 2025
3e74dfc
Merge branch 'main' into summarise-sub-reddit
elzik Oct 1, 2025
42751f4
Update tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/…
elzik Oct 1, 2025
c029f7d
Update tests/Elzik.Breef.Infrastructure.Tests.Integration/ContentExtr…
elzik Oct 1, 2025
f3a5bbe
Update tests/Elzik.Breef.Infrastructure.Tests.Unit/ContentExtractors/…
elzik Oct 1, 2025
8909784
Upgrade Sonar analysers
elzik Oct 2, 2025
ac24e98
Merge branch 'main' into summarise-sub-reddit
elzik Oct 3, 2025
61e419e
Add content extractor for reddit posts
elzik Oct 6, 2025
4b87fce
Improve tests
elzik Oct 7, 2025
5875c2c
Code quality fixes
elzik Oct 7, 2025
7696ed0
Add ability to configyre Reddit URLs and code quality fixes
elzik Oct 7, 2025
9673297
Code quality fixes
elzik Oct 8, 2025
0b5ccf2
Ensure that imgae fallback takes place in all cases
elzik Oct 9, 2025
da1782f
Code quality fixes
elzik Oct 9, 2025
f9a944c
Remove tests that do not test anything
elzik Oct 9, 2025
22dbad3
Add timeout to Docker tests
elzik Oct 9, 2025
1a6d54a
Code quality fixes
elzik Oct 9, 2025
cd2c236
Remove unecessary usings
elzik Oct 9, 2025
32406de
Code quality fixes
elzik Oct 9, 2025
2a1c6d5
Make reddit fallback image configurable
elzik Oct 9, 2025
3eb41f2
Rename existing types to Raw pattern
elzik Oct 9, 2025
bcefc60
Create new domain NewInSubreddit type
elzik Oct 9, 2025
139dd4e
Create transformer interface and implementation
elzik Oct 9, 2025
3ec3329
Create new SubredditClient following the established pattern
elzik Oct 11, 2025
7615d11
Update SubRedditContentExtractor to use new client
elzik Oct 22, 2025
66cf3de
Code quality fixes
elzik Oct 22, 2025
36edc81
Merge branch 'main' into summarise-sub-reddit
elzik Oct 23, 2025
0c613f9
Remove unecessary wrapping of HttpClient
elzik Oct 23, 2025
10b12c3
Fix readme indentation
elzik Oct 23, 2025
d9bc281
Ensure HTTP client is disposed in tests
elzik Oct 23, 2025
10702f7
Fix indentation
elzik Oct 24, 2025
49ae929
Dispose of HttpResponseMessage to prevent resource leak
elzik Oct 24, 2025
245cc26
Add URL with query string test
elzik Oct 24, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -396,3 +396,4 @@ FodyWeavers.xsd

# JetBrains Rider
*.sln.iml
/tests/LocalPlaylists
1 change: 1 addition & 0 deletions Elzik.Breef.sln
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "TestData", "TestData", "{7F
ProjectSection(SolutionItems) = preProject
tests\TestData\BbcNewsPage-ExpectedContent.txt = tests\TestData\BbcNewsPage-ExpectedContent.txt
tests\TestData\BbcNewsPage.html = tests\TestData\BbcNewsPage.html
tests\TestData\SampleRedditPost-1kqiwzc.json = tests\TestData\SampleRedditPost-1kqiwzc.json
tests\TestData\StaticTestPage.html = tests\TestData\StaticTestPage.html
tests\TestData\TestHtmlPage-ExpectedContent.txt = tests\TestData\TestHtmlPage-ExpectedContent.txt
tests\TestData\TestHtmlPage.html = tests\TestData\TestHtmlPage.html
Expand Down
37 changes: 33 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,34 @@ Example

### Optional

#### Reddit

These config items relate to the Reddit integration using the Options pattern with support for multiple Reddit instances.

- **DefaultBaseAddress** - The primary base address for Reddit API requests. Default: `"https://www.reddit.com"`. Must be a valid URL. It is used to configure the Refit HTTP clients, to extract fallback subreddit images, and as the primary Reddit instance for content extraction.
- **AdditionalBaseAddresses** - Additional Reddit instances that the content extractors can handle. Default: `["https://reddit.com"]` (includes non-www variant by default). Domain matching is **exact** - if you want to support both `reddit.com` and `www.reddit.com`, you must explicitly configure both.
- **FallbackImageUrl** - The fallback image URL used when subreddit-specific images cannot be retrieved. Default: `"https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Lockup_Logo.svg"`. This URL is used as the default Reddit logo when no subreddit banner, icon, or community image is available.

The Reddit integration allows extraction of content from:
- Custom Reddit instances
- Alternative Reddit domains
- Corporate or self-hosted Reddit installations
- Specific subdomains (e.g., `old.reddit.com`, `api.reddit.com`)

**Domain Validation**: The content extractors validate URLs using **exact domain matching**. `reddit.com` does NOT automatically allow `www.reddit.com` - each domain variant must be explicitly configured.

Example:

```jsonc
"Reddit": {
"DefaultBaseAddress": "https://www.reddit.com", // breef_Reddit__DefaultBaseAddress
"AdditionalBaseAddresses": [ // one environment variable per array index, as shown below
"https://reddit.com", // breef_Reddit__AdditionalBaseAddresses__0
],
"FallbackImageUrl": "https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Lockup_Logo.svg" // breef_Reddit__FallbackImageUrl
}
```

#### AiService

- **TimeOut** - Sets the number of seconds before the AiService in use times out. If not set, the default is 100 seconds. This may need to be increased where Ollama is used on limited hardware.
Expand Down Expand Up @@ -113,12 +141,14 @@ Example:
These settings affect how pages are downloaded prior to being summarised.

- **UserAgent** - The user agent used when downloading pages. By default this is set to `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36` but can be overridden here.
- **TimeoutSeconds** - The timeout in seconds for HTTP requests when downloading pages. By default this is set to `30` seconds but can be overridden here. Must be at least 1 second.

Example:

```jsonc
"WebPageDownLoader" : {
"UserAgent": "<custom-agent>" // breef_WebPageDownLoader__UserAgent
"HttpClient" : {
"UserAgent": "<custom-agent>", // breef_HttpClient__UserAgent
"TimeoutSeconds": 30 // breef_HttpClient__TimeoutSeconds
}
```

Expand All @@ -131,5 +161,4 @@ Logging is handled by Serilog and configuration is documented [here](https://git
"MinimumLevel": {
"Default": "Debug" // breef_Serilog__MinimumLevel__Default
}
}
```
}
2 changes: 1 addition & 1 deletion src/Elzik.Breef.Api/Elzik.Breef.Api.http
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ Post {{Elzik.Breef.Api_HostAddress}}/breefs
Content-Type: application/json
BREEF-API-KEY: test-key
{
"url":"https://www.bbc.co.uk/news/articles/cdedkr9439wo"
"url":"https://www.reddit.com/r/dotnet/comments/1o0j6or/im_giving_up_on_copilot_i_spend_more_time/"
}
60 changes: 52 additions & 8 deletions src/Elzik.Breef.Api/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@
using Elzik.Breef.Domain;
using Elzik.Breef.Infrastructure;
using Elzik.Breef.Infrastructure.AI;
using Elzik.Breef.Infrastructure.ContentExtractors;
using Elzik.Breef.Infrastructure.ContentExtractors.Reddit;
using Elzik.Breef.Infrastructure.ContentExtractors.Reddit.Client;
using Elzik.Breef.Infrastructure.ContentExtractors.Reddit.Client.Raw;
using Elzik.Breef.Infrastructure.Wallabag;
using Microsoft.Extensions.Options;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.ChatCompletion;
using Refit;
using Serilog;
using System.Reflection;
using System.Text.Json;
using System.Text.Json.Serialization;

namespace Elzik.Breef.Api;

Expand Down Expand Up @@ -49,7 +49,7 @@
{
options.AddDefaultPolicy(builder =>
{
builder.AllowAnyOrigin()

Check warning on line 52 in src/Elzik.Breef.Api/Program.cs

View workflow job for this annotation

GitHub Actions / build-ubuntu

Make sure this permissive CORS policy is safe here. (https://rules.sonarsource.com/csharp/RSPEC-5122)

Check warning on line 52 in src/Elzik.Breef.Api/Program.cs

View workflow job for this annotation

GitHub Actions / build-ubuntu

Make sure this permissive CORS policy is safe here. (https://rules.sonarsource.com/csharp/RSPEC-5122)

Check warning on line 52 in src/Elzik.Breef.Api/Program.cs

View workflow job for this annotation

GitHub Actions / build-ubuntu

Make sure this permissive CORS policy is safe here. (https://rules.sonarsource.com/csharp/RSPEC-5122)

Check warning on line 52 in src/Elzik.Breef.Api/Program.cs

View workflow job for this annotation

GitHub Actions / build-ubuntu

Make sure this permissive CORS policy is safe here. (https://rules.sonarsource.com/csharp/RSPEC-5122)
.AllowAnyMethod()
.AllowAnyHeader();
});
Expand All @@ -61,13 +61,57 @@
.ValidateOnStart();
builder.Services.AddAuth();

builder.Services.AddOptions<WebPageDownLoaderOptions>()
.Bind(configuration.GetSection("WebPageDownLoader"))
builder.Services.AddOptions<HttpClientOptions>()
.Bind(configuration.GetSection("HttpClient"))
.ValidateDataAnnotations()
.ValidateOnStart();
builder.Services.AddTransient<IWebPageDownloader, WebPageDownloader>();

builder.Services.AddTransient<IContentExtractor, ContentExtractor>();
builder.Services.AddHttpClient("BreefDownloader")
.ConfigureHttpClient((provider, client) =>
{
var httpClientOptions = provider.GetRequiredService<IOptions<HttpClientOptions>>().Value;
client.Timeout = TimeSpan.FromSeconds(httpClientOptions.TimeoutSeconds);
client.DefaultRequestHeaders.Add("User-Agent", httpClientOptions.UserAgent);
});

builder.Services.AddOptions<RedditOptions>()
.Bind(configuration.GetSection("Reddit"))
.ValidateDataAnnotations()
.ValidateOnStart();

builder.Services.AddRefitClient<IRawRedditPostClient>()
.ConfigureHttpClient((provider, client) =>
{
var redditOptions = provider.GetRequiredService<IOptions<RedditOptions>>().Value;
client.BaseAddress = new Uri(redditOptions.DefaultBaseAddress);
});

builder.Services.AddRefitClient<IRawSubredditClient>()
.ConfigureHttpClient((provider, client) =>
{
var redditOptions = provider.GetRequiredService<IOptions<RedditOptions>>().Value;
client.BaseAddress = new Uri(redditOptions.DefaultBaseAddress);
});

builder.Services.AddTransient<IRawRedditPostTransformer, RawRedditPostTransformer>();
builder.Services.AddTransient<IRedditPostClient, RedditPostClient>();
builder.Services.AddTransient<IRawNewInSubredditTransformer, RawNewInSubredditTransformer>();
builder.Services.AddTransient<ISubredditClient, SubredditClient>();

builder.Services.AddTransient<HtmlContentExtractor>();
builder.Services.AddTransient<SubredditContentExtractor>();
builder.Services.AddTransient<RedditPostContentExtractor>();
builder.Services.AddTransient<ISubredditImageExtractor, SubredditContentExtractor>();
builder.Services.AddTransient<IContentExtractor>(provider =>
{
var logger = provider.GetRequiredService<ILogger<ContentExtractorStrategy>>();
var defaultContentExtractor = provider.GetRequiredService<HtmlContentExtractor>();
var subredditExtractor = provider.GetRequiredService<SubredditContentExtractor>();
var redditPostExtractor = provider.GetRequiredService<RedditPostContentExtractor>();
return new ContentExtractorStrategy(logger,
[subredditExtractor, redditPostExtractor],
defaultContentExtractor);
});

builder.Services.AddOptions<AiServiceOptions>()
.Bind(configuration.GetSection("AiService"))
Expand Down
1 change: 0 additions & 1 deletion src/Elzik.Breef.Application/BreefGenerator.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using Elzik.Breef.Domain;
using System.Diagnostics;

namespace Elzik.Breef.Application
{
Expand Down
7 changes: 7 additions & 0 deletions src/Elzik.Breef.Application/Elzik.Breef.Application.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,13 @@
<Nullable>enable</Nullable>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="SonarAnalyzer.CSharp" Version="10.15.0.120848">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\Elzik.Breef.Domain\Elzik.Breef.Domain.csproj" />
</ItemGroup>
Expand Down
6 changes: 5 additions & 1 deletion src/Elzik.Breef.Domain/Elzik.Breef.Domain.csproj
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<Project Sdk="Microsoft.NET.Sdk">
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
Expand All @@ -9,6 +9,10 @@
<ItemGroup>
<PackageReference Include="HtmlAgilityPack" Version="1.12.4" />
<PackageReference Include="Microsoft.SemanticKernel.Abstractions" Version="1.65.0" />
<PackageReference Include="SonarAnalyzer.CSharp" Version="10.15.0.120848">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
</ItemGroup>

</Project>
2 changes: 2 additions & 0 deletions src/Elzik.Breef.Domain/IContentExtractor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
{
/// <summary>
/// Contract for a component that can produce an <see cref="Extract"/> from a URL.
/// </summary>
public interface IContentExtractor
{
/// <summary>
/// Reports whether this extractor is able to handle the supplied URL.
/// </summary>
/// <param name="webPageUrl">The URL to test.</param>
/// <returns><c>true</c> when this extractor can handle the URL; otherwise <c>false</c>.</returns>
bool CanHandle(string webPageUrl);

/// <summary>
/// Asynchronously produces an <see cref="Extract"/> for the supplied URL.
/// </summary>
/// <param name="webPageUrl">The URL to extract content from.</param>
/// <returns>A task that yields the resulting <see cref="Extract"/>.</returns>
Task<Extract> ExtractAsync(string webPageUrl);
}
}
7 changes: 0 additions & 7 deletions src/Elzik.Breef.Domain/IWebPageDownloader.cs

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
using Elzik.Breef.Domain;
using Microsoft.Extensions.Logging;

namespace Elzik.Breef.Infrastructure.ContentExtractors
{
    /// <summary>
    /// An <see cref="IContentExtractor"/> that delegates to the first extractor, from an
    /// ordered list, whose <see cref="IContentExtractor.CanHandle"/> returns <c>true</c>
    /// for the URL. Specific extractors are consulted in the order supplied; the default
    /// extractor is always consulted last.
    /// </summary>
    public class ContentExtractorStrategy : IContentExtractor
    {
        private readonly ILogger<ContentExtractorStrategy> _logger;
        private readonly List<IContentExtractor> _extractors;

        /// <summary>
        /// Creates the strategy from a set of specific extractors and a default extractor.
        /// </summary>
        /// <param name="logger">Logger used to record which extractor was selected.</param>
        /// <param name="specificExtractors">Extractors to try, in order, before the default.</param>
        /// <param name="defaultExtractor">Extractor to try after all specific extractors.</param>
        /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="defaultExtractor"/> also appears in <paramref name="specificExtractors"/>.
        /// </exception>
        public ContentExtractorStrategy(ILogger<ContentExtractorStrategy> logger,
            IEnumerable<IContentExtractor> specificExtractors, IContentExtractor defaultExtractor)
        {
            ArgumentNullException.ThrowIfNull(logger);
            ArgumentNullException.ThrowIfNull(specificExtractors);
            ArgumentNullException.ThrowIfNull(defaultExtractor);

            // Materialise the sequence exactly once: the input may be lazy or
            // one-shot, and it is needed both for the duplicate check and for storage.
            List<IContentExtractor> orderedExtractors = [.. specificExtractors];

            if (orderedExtractors.Contains(defaultExtractor))
            {
                throw new ArgumentException("Default extractor should not be in the specific extractors list.");
            }

            // The default extractor goes last so that specific extractors take precedence.
            orderedExtractors.Add(defaultExtractor);

            _logger = logger;
            _extractors = orderedExtractors;
        }

        /// <summary>
        /// Always returns <c>true</c>; selection of a concrete extractor happens in
        /// <see cref="ExtractAsync"/>.
        /// </summary>
        /// <param name="webPageUrl">The URL to test (not inspected).</param>
        public bool CanHandle(string webPageUrl) => true;

        /// <summary>
        /// Selects the first extractor that can handle <paramref name="webPageUrl"/>,
        /// logs its type name and returns the result of its extraction.
        /// </summary>
        /// <param name="webPageUrl">The URL to extract content from.</param>
        /// <returns>A task that yields the selected extractor's <see cref="Extract"/>.</returns>
        /// <exception cref="InvalidOperationException">
        /// No extractor, including the default, reports that it can handle the URL.
        /// </exception>
        public async Task<Extract> ExtractAsync(string webPageUrl)
        {
            var extractor = _extractors.FirstOrDefault(e => e.CanHandle(webPageUrl))
                ?? throw new InvalidOperationException(
                    $"No content extractor is able to handle '{webPageUrl}'.");

            _logger.LogInformation("Extraction will be provided for by {ExtractorName}", extractor.GetType().Name);

            return await extractor.ExtractAsync(webPageUrl);
        }
    }

}
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
using Elzik.Breef.Domain;
using HtmlAgilityPack;

namespace Elzik.Breef.Infrastructure;
namespace Elzik.Breef.Infrastructure.ContentExtractors;

public class ContentExtractor(IWebPageDownloader httpClient) : IContentExtractor
public class HtmlContentExtractor(IHttpClientFactory httpClientFactory) : IContentExtractor
{
public async Task<Extract> ExtractAsync(string webPageUrl)
{
var html = await httpClient.DownloadAsync(webPageUrl);
var httpClient = httpClientFactory.CreateClient("BreefDownloader");
var html = await httpClient.GetStringAsync(webPageUrl);
var htmlDocument = new HtmlDocument();
htmlDocument.LoadHtml(html);

Expand Down Expand Up @@ -78,4 +79,6 @@ private static string GetTitle(HtmlDocument htmlDocument, string defaultWhenMiss

return imageNodesSortedBySize.FirstOrDefault()?.ImageUrl;
}

public bool CanHandle(string webPageUrl) => true;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
using System.Text.Json.Serialization;

namespace Elzik.Breef.Infrastructure.ContentExtractors.Reddit.Client;

/// <summary>
/// JSON-mapped wrapper object whose only member is the <c>data</c> property.
/// </summary>
public class AboutSubreddit
{
/// <summary>Value of the <c>data</c> JSON property; <c>null</c> when not populated.</summary>
[JsonPropertyName("data")]
public AboutSubredditData? Data { get; set; }
}

/// <summary>
/// JSON-mapped payload carried in <see cref="AboutSubreddit.Data"/>. Every member is an
/// optional string bound to a snake_case JSON property name.
/// </summary>
// NOTE(review): the image-named members presumably hold image URLs - confirm against the Reddit API.
public class AboutSubredditData
{
/// <summary>Value of the <c>public_description</c> JSON property.</summary>
[JsonPropertyName("public_description")]
public string? PublicDescription { get; set; }

/// <summary>Value of the <c>icon_img</c> JSON property.</summary>
[JsonPropertyName("icon_img")]
public string? IconImg { get; set; }

/// <summary>Value of the <c>banner_img</c> JSON property.</summary>
[JsonPropertyName("banner_img")]
public string? BannerImg { get; set; }

/// <summary>Value of the <c>banner_background_image</c> JSON property.</summary>
[JsonPropertyName("banner_background_image")]
public string? BannerBackgroundImage { get; set; }

/// <summary>Value of the <c>mobile_banner_image</c> JSON property.</summary>
[JsonPropertyName("mobile_banner_image")]
public string? MobileBannerImage { get; set; }

/// <summary>Value of the <c>community_icon</c> JSON property.</summary>
[JsonPropertyName("community_icon")]
public string? CommunityIcon { get; set; }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
namespace Elzik.Breef.Infrastructure.ContentExtractors.Reddit.Client;

/// <summary>
/// Converts a <see cref="RawNewInSubreddit"/> into a <see cref="NewInSubreddit"/>.
/// </summary>
public interface IRawNewInSubredditTransformer
{
/// <summary>
/// Asynchronously transforms the raw representation into a <see cref="NewInSubreddit"/>.
/// </summary>
/// <param name="rawNewInSubreddit">The raw value to transform.</param>
/// <returns>A task that yields the transformed <see cref="NewInSubreddit"/>.</returns>
Task<NewInSubreddit> Transform(RawNewInSubreddit rawNewInSubreddit);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
namespace Elzik.Breef.Infrastructure.ContentExtractors.Reddit.Client
{
/// <summary>
/// Client that supplies <see cref="RedditPost"/> instances by post identifier.
/// </summary>
public interface IRedditPostClient
{
/// <summary>
/// Asynchronously gets the <see cref="RedditPost"/> for the given identifier.
/// </summary>
/// <param name="postId">Identifier of the post to get.</param>
/// <returns>A task that yields the <see cref="RedditPost"/>.</returns>
Task<RedditPost> GetPost(string postId);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
namespace Elzik.Breef.Infrastructure.ContentExtractors.Reddit.Client;

/// <summary>
/// Client that supplies a <see cref="NewInSubreddit"/> for a named subreddit.
/// </summary>
public interface ISubredditClient
{
/// <summary>
/// Asynchronously gets the <see cref="NewInSubreddit"/> for the given subreddit name.
/// </summary>
/// <param name="subRedditName">Name of the subreddit to query.</param>
/// <returns>A task that yields the <see cref="NewInSubreddit"/>.</returns>
Task<NewInSubreddit> GetNewInSubreddit(string subRedditName);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
namespace Elzik.Breef.Infrastructure.ContentExtractors.Reddit.Client;

/// <summary>
/// Holds a list of <see cref="RedditPost"/> items.
/// </summary>
public class NewInSubreddit
{
/// <summary>The posts; initialised to an empty list so it is never <c>null</c> by default.</summary>
public List<RedditPost> Posts { get; set; } = [];
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
using System.Text.Json;
using System.Text.Json.Serialization;

namespace Elzik.Breef.Infrastructure.ContentExtractors.Reddit.Client.Raw;

/// <summary>
/// JSON converter for nullable strings that also accepts a JSON integer token when
/// reading, returning its decimal text. Writing emits a JSON string, or a JSON null
/// when the value is <c>null</c>.
/// </summary>
public class FlexibleStringConverter : JsonConverter<string?>
{
    /// <summary>
    /// Reads the current token as a string.
    /// </summary>
    /// <param name="reader">Reader positioned on the token to convert.</param>
    /// <param name="typeToConvert">The target type (not inspected).</param>
    /// <param name="options">Serializer options (not inspected).</param>
    /// <returns>
    /// The string for a string token, the culture-invariant decimal text of the
    /// 64-bit integer for a number token, or <c>null</c> for a null token.
    /// </returns>
    /// <exception cref="JsonException">The token is not a string, number or null.</exception>
    public override string? Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        return reader.TokenType switch
        {
            JsonTokenType.String => reader.GetString(),
            // Format with the invariant culture so the text matches the JSON digits
            // regardless of the thread's current culture (e.g. a custom negative sign).
            JsonTokenType.Number => reader.GetInt64().ToString(System.Globalization.CultureInfo.InvariantCulture),
            JsonTokenType.Null => null,
            _ => throw new JsonException($"Cannot convert {reader.TokenType} to string")
        };
    }

    /// <summary>
    /// Writes <paramref name="value"/> as a JSON string, or as JSON null when it is <c>null</c>.
    /// </summary>
    /// <param name="writer">Writer to emit the value to.</param>
    /// <param name="value">The value to write.</param>
    /// <param name="options">Serializer options (not inspected).</param>
    public override void Write(Utf8JsonWriter writer, string? value, JsonSerializerOptions options)
    {
        if (value is null)
        {
            writer.WriteNullValue();
        }
        else
        {
            writer.WriteStringValue(value);
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
using Refit;

namespace Elzik.Breef.Infrastructure.ContentExtractors.Reddit.Client.Raw
{
/// <summary>
/// Refit-declared HTTP client that returns a <see cref="RawRedditPost"/>.
/// </summary>
public interface IRawRedditPostClient
{
/// <summary>
/// Issues a GET request to <c>/comments/{postId}.json</c>, sending a fixed
/// <c>User-Agent</c> header that identifies breef.
/// </summary>
/// <param name="postId">Value substituted into the <c>{postId}</c> segment of the path.</param>
/// <returns>A task that yields the response as a <see cref="RawRedditPost"/>.</returns>
[Get("/comments/{postId}.json")]
[Headers("User-Agent: breef/1.0.0 (https://github.com/elzik/breef)")]
Task<RawRedditPost> GetPost(string postId);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
namespace Elzik.Breef.Infrastructure.ContentExtractors.Reddit.Client.Raw;

/// <summary>
/// Converts a <see cref="RawRedditPost"/> into a <see cref="RedditPost"/>.
/// </summary>
public interface IRawRedditPostTransformer
{
/// <summary>
/// Transforms the raw representation into a <see cref="RedditPost"/>.
/// </summary>
/// <param name="rawRedditPost">The raw value to transform.</param>
/// <returns>The transformed <see cref="RedditPost"/>.</returns>
RedditPost Transform(RawRedditPost rawRedditPost);
}
Loading