14 changed files with 336 additions and 3 deletions
@ -0,0 +1,14 @@ |
|||||
|
# app/jobs/training_watch_job.rb |
||||
|
# Background job that runs the training-watch checker and e-mails any newly
# found hits to a single recipient.
#
# NOTE(review): the fallback recipient is a hard-coded personal address; it is
# used whenever TRAINING_WATCH_NOTIFY_EMAIL is not set. Consider moving it to
# configuration.
# NOTE(review): the checker is asked for *new* hits only, so if mail delivery
# fails here a retry will presumably find nothing left to report — confirm
# against TrainingWatch::Checker.
class TrainingWatchJob < ApplicationJob
  queue_as :default

  # Runs the checker and sends one notification mail when there is something
  # to report and a recipient address is available.
  #
  # @return [void]
  def perform
    found = TrainingWatch::Checker.new.run!
    recipient = ENV.fetch("TRAINING_WATCH_NOTIFY_EMAIL", "christoph@marzell.net")

    # Nothing new, or the address was explicitly configured as empty.
    return if found.empty? || recipient.blank?

    TrainingWatchMailer.notify(email: recipient, results: found).deliver_now
  end
end
||||
@ -0,0 +1,7 @@ |
|||||
|
# app/mailers/training_watch_mailer.rb |
||||
|
# Mailer for training-watch notifications.
class TrainingWatchMailer < ApplicationMailer
  # Builds the notification mail listing new hits.
  #
  # @param email [String] recipient address
  # @param results [Enumerable] checker results; exposed to the template as
  #   @results
  # @return the message object produced by +mail+
  def notify(email:, results:)
    @results = results

    envelope = {
      to: email,
      subject: "Lehrgänge 2027: neue Treffer gefunden"
    }
    mail(envelope)
  end
end
||||
@ -0,0 +1,7 @@ |
|||||
|
# frozen_string_literal: true |
||||
|
# app/models/training_watch_hit.rb |
||||
|
# A single match recorded for a TrainingWatchSource.
#
# The fingerprint must be present and unique; it is what prevents the same
# hit from being stored twice.
class TrainingWatchHit < ApplicationRecord
  belongs_to :training_watch_source

  validates :fingerprint, presence: true
  validates :fingerprint, uniqueness: true
end
||||
@ -0,0 +1,10 @@ |
|||||
|
# frozen_string_literal: true |
||||
|
|
||||
|
# app/models/training_watch_source.rb |
||||
|
require "uri"

# A web page or feed that is polled for matches.
#
# Besides presence/uniqueness checks, the model rejects URLs that are not
# absolute http(s) URLs (they are fetched server-side) and custom match
# patterns that do not compile (so a typo is reported when the record is
# saved instead of being discovered later at check time).
class TrainingWatchSource < ApplicationRecord
  # Supported source kinds.
  KINDS = %w[html rss].freeze

  # Hits are removed with plain SQL deletes (no callbacks) when a source is
  # destroyed.
  has_many :training_watch_hits, dependent: :delete_all

  validates :name, :url, :kind, presence: true
  validates :url, uniqueness: true
  validates :kind, inclusion: { in: KINDS }
  validate :url_must_be_http
  validate :match_regex_must_compile

  private

  # Adds an error unless +url+ parses as an absolute http/https URL with a
  # host. Blank values are left to the presence validation.
  def url_must_be_http
    return if url.blank?

    parsed = URI.parse(url)
    # URI::HTTPS is a subclass of URI::HTTP, so this accepts both schemes.
    errors.add(:url, :invalid) unless parsed.is_a?(URI::HTTP) && parsed.host.present?
  rescue URI::InvalidURIError
    errors.add(:url, :invalid)
  end

  # Adds an error when the optional +match_regex+ is not a valid pattern.
  def match_regex_must_compile
    return if match_regex.blank?

    Regexp.new(match_regex)
  rescue RegexpError => e
    errors.add(:match_regex, "is not a valid regular expression: #{e.message}")
  end
end
||||
@ -0,0 +1,183 @@ |
|||||
|
# app/services/training_watch/checker.rb |
||||
|
require "digest" |
||||
|
require "faraday" |
||||
|
require "nokogiri" |
||||
|
|
||||
|
module TrainingWatch
  # Polls every enabled source, looks for matches in the fetched document and
  # stores each previously unseen match as a TrainingWatchHit.
  #
  # Usage: +TrainingWatch::Checker.new.run!+ returns an array of Result
  # structs, one per source that produced at least one new hit.
  class Checker
    # Default matcher: the year 2027 followed, anywhere later in the text
    # (the /m flag lets "." span newlines), by one of the listed keywords.
    DEFAULT_REGEX = /
      \b2027\b
      .*?
      (start|beginn|lehrgang|curriculum|gruppe|kurs|aufnahme|bewerb|anmeld|termin)
    /imx

    USER_AGENT = "RailsTrainingWatch/1.0 (+https://your-app.example)".freeze

    # Links whose label or URL mention the year are reported individually.
    LINK_YEAR_REGEX = /\b2027\b/
    # Upper bound on link hits recorded per page and run.
    MAX_LINK_HITS = 10

    # Snippet window: characters kept before the match, total window length,
    # and length of the fallback excerpt when there is no match.
    SNIPPET_LEAD = 120
    SNIPPET_LENGTH = 380
    SNIPPET_FALLBACK_LENGTH = 280

    Result = Struct.new(:source, :new_hits, keyword_init: true)

    # @param sources [#find_each, #each] sources to check; defaults to all
    #   enabled TrainingWatchSource records
    # @param logger [#info, #warn] log sink
    def initialize(sources: TrainingWatchSource.where(enabled: true), logger: Rails.logger)
      @sources = sources
      @logger = logger
      @http = Faraday.new do |f|
        f.options.timeout = 15
        f.options.open_timeout = 10
        f.headers["User-Agent"] = USER_AGENT
        f.adapter Faraday.default_adapter
      end
    end

    # Checks every source. A failure in one source is logged and does not
    # stop the remaining sources from being checked.
    #
    # @return [Array<Result>] one entry per source with at least one new hit
    def run!
      results = []

      each_source do |source|
        new_hits = check_one!(source)
        results << Result.new(source:, new_hits:) if new_hits.any?
      rescue StandardError => e
        @logger.warn("[TrainingWatch] source=#{source.id} #{source.name} failed: #{e.class}: #{e.message}")
      ensure
        mark_checked(source)
      end

      results
    end

    private

    # Iterates in batches when given a relation, and plainly otherwise, so an
    # Array of sources can be injected as well.
    def each_source(&block)
      return @sources.find_each(&block) if @sources.respond_to?(:find_each)

      @sources.each(&block)
    end

    # Records the time of the check. Runs inside +ensure+, so it must never
    # raise: an exception here would abort the whole run.
    def mark_checked(source)
      source.update!(last_checked_at: Time.current)
    rescue StandardError => e
      @logger.warn("[TrainingWatch] source=#{source.id} could not store last_checked_at: #{e.class}: #{e.message}")
    end

    # Fetches one source and dispatches on its kind.
    #
    # @return [Array<TrainingWatchHit>] newly created hits (possibly empty)
    def check_one!(source)
      response = fetch(source)
      return [] if response.nil?

      hits =
        case source.kind
        when "html" then check_html(source, response)
        when "rss" then check_rss(source, response)
        else []
        end

      # Only remember the cache validators once the document has been
      # processed. Storing them earlier means a processing failure is followed
      # by 304 responses and the content is never examined again.
      remember_validators(source, response)
      hits
    end

    # Performs a conditional GET using the stored ETag / Last-Modified values.
    #
    # @return [Faraday::Response, nil] nil when the server answers 304
    # @raise [RuntimeError] for any other non-2xx status. Redirects are not
    #   followed by this connection, so a 3xx is reported instead of being
    #   parsed as if it were the page.
    def fetch(source)
      headers = {}
      headers["If-None-Match"] = source.last_etag if source.last_etag.present?
      headers["If-Modified-Since"] = source.last_modified if source.last_modified.present?

      resp = @http.get(source.url, nil, headers)

      if resp.status == 304
        @logger.info("[TrainingWatch] source=#{source.id} not modified")
        return nil
      end

      raise "HTTP #{resp.status}" unless resp.success?

      resp
    end

    # Stores the response's cache validators for the next conditional GET.
    # Best effort: hits created during this run must still be reported even
    # if this bookkeeping write fails.
    def remember_validators(source, response)
      source.update!(
        last_etag: response.headers["etag"],
        last_modified: response.headers["last-modified"]
      )
    rescue StandardError => e
      @logger.warn("[TrainingWatch] source=#{source.id} could not store cache validators: #{e.class}: #{e.message}")
    end

    # Matches the page text against the source's regex. When the page
    # matches, links that mention the year are recorded individually;
    # otherwise a single hit for the page itself is recorded.
    #
    # NOTE(review): +doc.text+ is the text of the whole document; if that
    # includes inline script/style content it can cause false positives —
    # confirm and strip those nodes if so.
    def check_html(source, response)
      doc = Nokogiri::HTML(response.body.to_s)

      page_title = doc.at_css("title")&.text&.strip
      text = doc.text.gsub(/\s+/, " ").strip

      regex = compile_regex(source)
      return [] unless text.match?(regex)

      # The snippet depends only on the page text, so build it once.
      snippet = build_snippet(text, regex)
      links = year_links(doc, source.url)

      if links.empty?
        page_hit = create_hit!(
          source:,
          title: page_title.presence || source.name,
          hit_url: source.url,
          snippet:
        )
        return [page_hit].compact
      end

      links.filter_map do |link|
        create_hit!(
          source:,
          title: link[:label].presence || page_title,
          hit_url: link[:url],
          snippet:
        )
      end
    end

    # Collects the page's links whose label or absolute URL contains the
    # year, de-duplicated and capped at MAX_LINK_HITS.
    #
    # @return [Array<Hash>] hashes with :label and :url
    def year_links(doc, base_url)
      candidates = doc.css("a").filter_map do |anchor|
        href = anchor["href"].to_s.strip
        next if href.empty?

        { label: anchor.text.to_s.gsub(/\s+/, " ").strip, url: absolutize_url(base_url, href) }
      end

      candidates
        .uniq
        .select { |c| "#{c[:label]} #{c[:url]}".match?(LINK_YEAR_REGEX) }
        .first(MAX_LINK_HITS)
    end

    # Optional RSS support: the feed body is matched as plain text and at
    # most one hit (pointing at the feed itself) is recorded.
    def check_rss(source, response)
      body = response.body.to_s
      regex = compile_regex(source)
      return [] unless body.match?(regex)

      hit = create_hit!(
        source:,
        title: source.name,
        hit_url: source.url,
        snippet: build_snippet(body.gsub(/\s+/, " "), regex)
      )
      hit ? [hit] : []
    end

    # Returns the source's custom matcher, or DEFAULT_REGEX when none is set
    # or the stored pattern does not compile (the latter is logged).
    def compile_regex(source)
      return DEFAULT_REGEX if source.match_regex.blank?

      Regexp.new(source.match_regex, Regexp::IGNORECASE | Regexp::MULTILINE)
    rescue RegexpError => e
      @logger.warn("[TrainingWatch] source=#{source.id} invalid match_regex, using default: #{e.message}")
      DEFAULT_REGEX
    end

    # Cuts an excerpt of +text+ around the first match of +regex+.
    #
    # @return [String] up to SNIPPET_LENGTH characters starting SNIPPET_LEAD
    #   characters before the match, stripped; or the first
    #   SNIPPET_FALLBACK_LENGTH characters when nothing matches
    def build_snippet(text, regex)
      m = text.match(regex)
      return text[0, SNIPPET_FALLBACK_LENGTH] if m.nil?

      start = [m.begin(0) - SNIPPET_LEAD, 0].max
      text[start, SNIPPET_LENGTH].strip
    end

    # Resolves +href+ against +base+. Best effort: anything that cannot be
    # parsed or joined is returned unchanged.
    def absolutize_url(base, href)
      uri = URI.parse(href)
      return uri.to_s if uri.absolute?

      URI.join(base, href).to_s
    rescue StandardError
      href
    end

    # Stores a hit unless one with the same fingerprint already exists.
    #
    # The fingerprint covers source id, title, URL and the first 200
    # characters of the snippet, so a change in the surrounding page text
    # produces a new fingerprint for the same link.
    #
    # @return [TrainingWatchHit, nil] the new record, or nil for a duplicate
    def create_hit!(source:, title:, hit_url:, snippet:, published_at: nil)
      fp = Digest::SHA256.hexdigest([source.id, title.to_s, hit_url.to_s, snippet.to_s[0, 200]].join("|"))

      # The model validates fingerprint uniqueness, so create! would raise
      # RecordInvalid (not RecordNotUnique) for a known hit and abort the
      # rest of the source's check. Skip known hits up front instead.
      return nil if TrainingWatchHit.exists?(fingerprint: fp)

      TrainingWatchHit.create!(
        training_watch_source: source,
        title: title.to_s.strip.presence,
        hit_url: hit_url.to_s.strip.presence,
        snippet: snippet.to_s.strip.presence,
        published_at:,
        fingerprint: fp
      )
    rescue ActiveRecord::RecordNotUnique
      # Lost a race against a concurrent insert; the unique index caught it.
      nil
    end
  end
end
||||
@ -0,0 +1,12 @@ |
|||||
|
<%# app/views/training_watch_mailer/notify.text.erb %>
<%# Plain-text body of the notification mail: one section per source, one entry per new hit. %>
<%# ERB comments are used here because an HTML comment would be sent verbatim in a text mail. %>
Neue Treffer für Lehrgänge Start 2027:

<% @results.each do |r| %>
Quelle: <%= r.source.name %> (<%= r.source.url %>)
<% r.new_hits.each do |h| %>
- <%= h.title.presence || "Treffer" %>
  <%= h.hit_url %>
  <%= h.snippet %>
<% end %>

<% end %>
||||
@ -0,0 +1,18 @@ |
|||||
|
# Creates the table of pages/feeds that are polled for matches.
class CreateTrainingWatchSources < ActiveRecord::Migration[7.1]
  def change
    create_table :training_watch_sources do |t|
      t.string :name, null: false
      t.string :url, null: false
      # "html" or "rss" (rss is optional)
      t.string :kind, null: false, default: "html"
      # Optional: overrides the default matcher
      t.string :match_regex
      t.string :last_etag
      t.string :last_modified
      t.datetime :last_checked_at
      t.boolean :enabled, null: false, default: true

      t.timestamps

      t.index :enabled
      t.index :url, unique: true
    end
  end
end
||||
@ -0,0 +1,16 @@ |
|||||
|
# Creates the table of recorded matches; each row belongs to one source and
# is identified by a unique fingerprint.
class CreateTrainingWatchHits < ActiveRecord::Migration[7.1]
  def change
    create_table :training_watch_hits do |t|
      t.references :training_watch_source, null: false, foreign_key: true
      t.string :title
      t.string :hit_url
      t.datetime :published_at
      t.text :snippet
      t.string :fingerprint, null: false

      t.timestamps

      # The unique index backs the model-level uniqueness validation.
      t.index :fingerprint, unique: true
      t.index :created_at
    end
  end
end
||||
Loading…
Reference in new issue