14 changed files with 336 additions and 3 deletions
@@ -0,0 +1,14 @@
# app/jobs/training_watch_job.rb
class TrainingWatchJob < ApplicationJob
  queue_as :default

  def perform
    results = TrainingWatch::Checker.new.run!
    return if results.empty?

    email = ENV.fetch("TRAINING_WATCH_NOTIFY_EMAIL", "christoph@marzell.net")
    return if email.blank?

    TrainingWatchMailer.notify(email: email, results: results).deliver_now
  end
end
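Note (not part of this diff): nothing in the changeset schedules this job yet. A minimal sketch, assuming the `whenever` gem is used for cron scheduling; any scheduler that can enqueue `TrainingWatchJob.perform_later` once a day works just as well.

# config/schedule.rb (sketch, assumes the `whenever` gem)
every 1.day, at: "7:00" do
  runner "TrainingWatchJob.perform_later"
end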
@@ -0,0 +1,7 @@
# app/mailers/training_watch_mailer.rb
class TrainingWatchMailer < ApplicationMailer
  def notify(email:, results:)
    @results = results
    mail(to: email, subject: "Lehrgänge 2027: neue Treffer gefunden")
  end
end
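Note (not part of this diff): to eyeball the text template without sending mail, a standard ActionMailer preview could be added. The path and all sample data below are illustrative assumptions, not from this changeset.

# test/mailers/previews/training_watch_mailer_preview.rb (sketch)
class TrainingWatchMailerPreview < ActionMailer::Preview
  def notify
    # Build throwaway records so the preview works on an empty database
    source = TrainingWatchSource.first ||
             TrainingWatchSource.new(name: "Beispielquelle", url: "https://example.org", kind: "html")
    hit = TrainingWatchHit.new(title: "Lehrgang 2027", hit_url: "https://example.org/2027", snippet: "Start 2027 ...")
    result = TrainingWatch::Checker::Result.new(source: source, new_hits: [hit])
    TrainingWatchMailer.notify(email: "preview@example.org", results: [result])
  end
end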
@@ -0,0 +1,7 @@
# frozen_string_literal: true
# app/models/training_watch_hit.rb
class TrainingWatchHit < ApplicationRecord
  belongs_to :training_watch_source

  validates :fingerprint, presence: true, uniqueness: true
end
@@ -0,0 +1,10 @@
# frozen_string_literal: true

# app/models/training_watch_source.rb
class TrainingWatchSource < ApplicationRecord
  has_many :training_watch_hits, dependent: :delete_all

  validates :name, :url, :kind, presence: true
  validates :url, uniqueness: true
  validates :kind, inclusion: { in: %w[html rss] }
end
@@ -0,0 +1,183 @@
# app/services/training_watch/checker.rb
require "digest"
require "faraday"
require "nokogiri"

module TrainingWatch
  class Checker
    DEFAULT_REGEX = /
      \b2027\b
      .*?
      (start|beginn|lehrgang|curriculum|gruppe|kurs|aufnahme|bewerb|anmeld|termin)
    /imx

    USER_AGENT = "RailsTrainingWatch/1.0 (+https://your-app.example)"

    Result = Struct.new(:source, :new_hits, keyword_init: true)

    def initialize(sources: TrainingWatchSource.where(enabled: true), logger: Rails.logger)
      @sources = sources
      @logger = logger
      @http = Faraday.new do |f|
        f.options.timeout = 15
        f.options.open_timeout = 10
        f.headers["User-Agent"] = USER_AGENT
        f.adapter Faraday.default_adapter
      end
    end

    def run!
      results = []

      @sources.find_each do |source|
        begin
          new_hits = check_one!(source)
          results << Result.new(source:, new_hits:) if new_hits.any?
        rescue => e
          @logger.warn("[TrainingWatch] source=#{source.id} #{source.name} failed: #{e.class}: #{e.message}")
        ensure
          source.update!(last_checked_at: Time.current)
        end
      end

      results
    end

    private

    def check_one!(source)
      response = fetch(source)
      return [] if response.nil?

      case source.kind
      when "html" then check_html(source, response)
      when "rss" then check_rss(source, response) # optional, stub below
      else []
      end
    end

    def fetch(source)
      headers = {}
      headers["If-None-Match"] = source.last_etag if source.last_etag.present?
      headers["If-Modified-Since"] = source.last_modified if source.last_modified.present?

      resp = @http.get(source.url, nil, headers)

      if resp.status == 304
        @logger.info("[TrainingWatch] source=#{source.id} not modified")
        return nil
      end

      if resp.status >= 400
        raise "HTTP #{resp.status}"
      end

      source.update!(
        last_etag: resp.headers["etag"],
        last_modified: resp.headers["last-modified"]
      )

      resp
    end

    def check_html(source, response)
      html = response.body.to_s
      doc = Nokogiri::HTML(html)

      page_title = doc.at_css("title")&.text&.strip
      text = doc.text.gsub(/\s+/, " ").strip

      regex = compile_regex(source)
      return [] unless text.match?(regex)

      # Collect links that might also match (or at least contain "2027")
      candidates = doc.css("a").map do |a|
        href = a["href"].to_s.strip
        next if href.empty?

        label = a.text.to_s.gsub(/\s+/, " ").strip
        abs = absolutize_url(source.url, href)
        { label:, url: abs }
      end.compact

      # If links explicitly contain 2027, prioritize them; otherwise record a page-level hit
      link_hits = candidates.select { |c| (c[:label] + " " + c[:url]).match?(/\b2027\b/i) }.first(10)

      hits = []
      if link_hits.any?
        link_hits.each do |c|
          hits << create_hit!(
            source:,
            title: c[:label].presence || page_title,
            hit_url: c[:url],
            snippet: build_snippet(text, regex)
          )
        end
      else
        hits << create_hit!(
          source:,
          title: page_title || source.name,
          hit_url: source.url,
          snippet: build_snippet(text, regex)
        )
      end

      hits.compact
    end

    # Optional: RSS support (if you later use feed URLs)
    def check_rss(source, response)
      # Minimal: match the raw RSS body as text (robust enough for "2027")
      body = response.body.to_s
      regex = compile_regex(source)
      return [] unless body.match?(regex)

      create = create_hit!(
        source:,
        title: source.name,
        hit_url: source.url,
        snippet: build_snippet(body.gsub(/\s+/, " "), regex)
      )
      create ? [create] : []
    end

    def compile_regex(source)
      return DEFAULT_REGEX if source.match_regex.blank?
      Regexp.new(source.match_regex, Regexp::IGNORECASE | Regexp::MULTILINE)
    rescue RegexpError
      DEFAULT_REGEX
    end

    def build_snippet(text, regex)
      m = text.match(regex)
      return text[0, 280] if m.nil?

      start = [m.begin(0) - 120, 0].max
      slice = text[start, 380] || text
      slice.strip
    end

    def absolutize_url(base, href)
      uri = URI.parse(href)
      return uri.to_s if uri.absolute?
      URI.join(base, href).to_s
    rescue
      href
    end

    def create_hit!(source:, title:, hit_url:, snippet:, published_at: nil)
      fp = Digest::SHA256.hexdigest([source.id, title.to_s, hit_url.to_s, snippet.to_s[0, 200]].join("|"))

      TrainingWatchHit.create!(
        training_watch_source: source,
        title: title.to_s.strip.presence,
        hit_url: hit_url.to_s.strip.presence,
        snippet: snippet.to_s.strip.presence,
        published_at:,
        fingerprint: fp
      )
    rescue ActiveRecord::RecordNotUnique, ActiveRecord::RecordInvalid
      # Duplicate fingerprint: the DB index raises RecordNotUnique, the model-level
      # uniqueness validation raises RecordInvalid. Either way it is not a new hit.
      nil
    end
  end
end
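Note (not part of this diff): a quick way to exercise the checker from the Rails console or db/seeds.rb. The source below is an illustrative placeholder, not one from this changeset.

# Illustrative only: name and URL are placeholders
TrainingWatchSource.find_or_create_by!(url: "https://example.org/lehrgaenge") do |s|
  s.name = "Beispiel-Institut"
  s.kind = "html"
end

results = TrainingWatch::Checker.new.run!
results.each { |r| puts "#{r.source.name}: #{r.new_hits.size} new hit(s)" }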
@@ -0,0 +1,12 @@
<%# app/views/training_watch_mailer/notify.text.erb (ERB comment so the path does not leak into the plain-text email) %>
Neue Treffer für Lehrgänge Start 2027:

<% @results.each do |r| %>
Quelle: <%= r.source.name %> (<%= r.source.url %>)
<% r.new_hits.each do |h| %>
- <%= h.title.presence || "Treffer" %>
  <%= h.hit_url %>
  <%= h.snippet %>
<% end %>

<% end %>
@@ -0,0 +1,18 @@
class CreateTrainingWatchSources < ActiveRecord::Migration[7.1]
  def change
    create_table :training_watch_sources do |t|
      t.string :name, null: false
      t.string :url, null: false
      t.string :kind, null: false, default: "html" # html | rss (optional)
      t.string :match_regex # optional: overrides the default matcher
      t.string :last_etag
      t.string :last_modified
      t.datetime :last_checked_at
      t.boolean :enabled, null: false, default: true
      t.timestamps
    end

    add_index :training_watch_sources, :enabled
    add_index :training_watch_sources, :url, unique: true
  end
end
@@ -0,0 +1,16 @@
class CreateTrainingWatchHits < ActiveRecord::Migration[7.1]
  def change
    create_table :training_watch_hits do |t|
      t.references :training_watch_source, null: false, foreign_key: true
      t.string :title
      t.string :hit_url
      t.datetime :published_at
      t.text :snippet
      t.string :fingerprint, null: false
      t.timestamps
    end

    add_index :training_watch_hits, :fingerprint, unique: true
    add_index :training_watch_hits, :created_at
  end
end