From 983497ad2ec05ac9e22f45d9cfad336fb4c6b6ad Mon Sep 17 00:00:00 2001 From: Christoph Marzell Date: Sun, 7 Dec 2025 07:00:39 +0100 Subject: [PATCH] add fetch new curses --- Gemfile | 4 +- Gemfile.lock | 12 ++ app/jobs/training_watch_job.rb | 14 ++ app/mailers/training_watch_mailer.rb | 7 + app/models/training_watch_hit.rb | 7 + app/models/training_watch_source.rb | 10 + app/services/training_watch/checker.rb | 183 ++++++++++++++++++ app/views/layouts/application.html.erb | 2 +- .../training_watch_mailer/notify.text.erb | 12 ++ config/initializers/dump_scheduler.rb | 6 + ...207054128_create_training_watch_sources.rb | 18 ++ ...251207054146_create_training_watch_hits.rb | 16 ++ db/schema.rb | 32 ++- db/seeds.rb | 16 ++ 14 files changed, 336 insertions(+), 3 deletions(-) create mode 100644 app/jobs/training_watch_job.rb create mode 100644 app/mailers/training_watch_mailer.rb create mode 100644 app/models/training_watch_hit.rb create mode 100644 app/models/training_watch_source.rb create mode 100644 app/services/training_watch/checker.rb create mode 100644 app/views/training_watch_mailer/notify.text.erb create mode 100644 db/migrate/20251207054128_create_training_watch_sources.rb create mode 100644 db/migrate/20251207054146_create_training_watch_hits.rb diff --git a/Gemfile b/Gemfile index 7d4487d..e92c973 100644 --- a/Gemfile +++ b/Gemfile @@ -59,4 +59,6 @@ end gem 'devise' gem 'rufus-scheduler' -gem 'pghero' \ No newline at end of file +gem 'pghero' +gem "faraday" +gem "nokogiri" diff --git a/Gemfile.lock b/Gemfile.lock index 038e8e1..371f8b6 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -110,6 +110,12 @@ GEM erubi (1.13.1) et-orbi (1.4.0) tzinfo + faraday (2.14.0) + faraday-net_http (>= 2.0, < 3.5) + json + logger + faraday-net_http (3.4.2) + net-http (~> 0.5) ffi (1.17.2-x64-mingw32) ffi (1.17.2-x86_64-linux-gnu) fugit (1.12.1) @@ -133,6 +139,7 @@ GEM thor (>= 0.14, < 2.0) jquery-ui-rails (8.0.0) railties (>= 3.2.16) + json (2.17.1) kaminari (1.2.2) 
activesupport (>= 4.1.0) kaminari-actionview (= 1.2.2) @@ -160,6 +167,8 @@ GEM minitest (5.26.1) msgpack (1.8.0) mutex_m (0.3.0) + net-http (0.8.0) + uri (>= 0.11.1) net-imap (0.4.22) date net-protocol @@ -271,6 +280,7 @@ GEM concurrent-ruby (~> 1.0) tzinfo-data (1.2025.2) tzinfo (>= 1.0.0) + uri (1.1.1) warden (1.2.9) rack (>= 2.0.9) web-console (4.2.1) @@ -292,9 +302,11 @@ DEPENDENCIES administrate bootsnap devise + faraday jbuilder jquery-rails jquery-ui-rails + nokogiri pg (~> 1.1) pghero puma (>= 5.0)
diff --git a/app/jobs/training_watch_job.rb b/app/jobs/training_watch_job.rb
new file mode 100644
index 0000000..9121a7e
--- /dev/null
+++ b/app/jobs/training_watch_job.rb
@@ -0,0 +1,16 @@
+# app/jobs/training_watch_job.rb
+#
+# Runs the training watch check and mails the results.
+class TrainingWatchJob < ApplicationJob
+  queue_as :default
+
+  # Runs TrainingWatch::Checker and sends one mail with its results. No mail
+  # is sent when there are no results or the recipient address is blank.
+  def perform
+    results = TrainingWatch::Checker.new.run!
+    recipient = ENV.fetch("TRAINING_WATCH_NOTIFY_EMAIL", "christoph@marzell.net")
+    return if results.empty? || recipient.blank?
+
+    TrainingWatchMailer.notify(email: recipient, results: results).deliver_now
+  end
+end
diff --git a/app/mailers/training_watch_mailer.rb b/app/mailers/training_watch_mailer.rb
new file mode 100644
index 0000000..37c87ef
--- /dev/null
+++ b/app/mailers/training_watch_mailer.rb
@@ -0,0 +1,14 @@
+# app/mailers/training_watch_mailer.rb
+#
+# Mailer for the training watch notification.
+class TrainingWatchMailer < ApplicationMailer
+  # Builds the notification mail.
+  #
+  # email   - recipient address
+  # results - made available to the template as @results
+  def notify(email:, results:)
+    subject_line = "Lehrgänge 2027: neue Treffer gefunden"
+    @results = results
+    mail(to: email, subject: subject_line)
+  end
+end
\ No newline at end of file
diff --git a/app/models/training_watch_hit.rb b/app/models/training_watch_hit.rb
new file mode 100644
index 0000000..00122bc
--- /dev/null
+++ b/app/models/training_watch_hit.rb
@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+# app/models/training_watch_hit.rb
+#
+# A stored hit that belongs to one TrainingWatchSource.
+class TrainingWatchHit < ApplicationRecord
+  belongs_to :training_watch_source
+
+  # Every hit needs a fingerprint, and no two hits may share one.
+  validates :fingerprint, presence: true
+  validates :fingerprint, uniqueness: true
+end
diff --git a/app/models/training_watch_source.rb
b/app/models/training_watch_source.rb
new file mode 100644
index 0000000..2644199
--- /dev/null
+++ b/app/models/training_watch_source.rb
@@ -0,0 +1,13 @@
+# frozen_string_literal: true
+
+# app/models/training_watch_source.rb
+#
+# A page or feed that is checked for hits. Destroying a source removes its
+# hits via delete_all.
+class TrainingWatchSource < ApplicationRecord
+  has_many :training_watch_hits, dependent: :delete_all
+
+  validates :name, :url, :kind, presence: true
+  validates :url, uniqueness: true
+  validates :kind, inclusion: { in: %w[html rss] }
+end
\ No newline at end of file
diff --git a/app/services/training_watch/checker.rb b/app/services/training_watch/checker.rb
new file mode 100644
index 0000000..67a353f
--- /dev/null
+++ b/app/services/training_watch/checker.rb
@@ -0,0 +1,235 @@
+# app/services/training_watch/checker.rb
+require "digest"
+require "uri"
+require "faraday"
+require "nokogiri"
+
+module TrainingWatch
+  # Fetches each source, matches the response text against the source's regex
+  # (DEFAULT_REGEX when none is set) and stores every match that is not yet
+  # known as a TrainingWatchHit.
+  class Checker
+    # Fallback pattern: "2027" followed anywhere later in the text by one of
+    # the listed keywords (case-insensitive, "." also matches newlines).
+    DEFAULT_REGEX = /
+      \b2027\b
+      .*?
+      (start|beginn|lehrgang|curriculum|gruppe|kurs|aufnahme|bewerb|anmeld|termin)
+    /imx
+
+    USER_AGENT = "RailsTrainingWatch/1.0 (+https://your-app.example)".freeze
+
+    # One entry per source that produced at least one new hit.
+    Result = Struct.new(:source, :new_hits, keyword_init: true)
+
+    # sources - sources to check; iterated with find_each
+    # logger  - receives the info/warn messages written by this class
+    def initialize(sources: TrainingWatchSource.where(enabled: true), logger: Rails.logger)
+      @sources = sources
+      @logger = logger
+      @http = Faraday.new do |f|
+        f.options.timeout = 15
+        f.options.open_timeout = 10
+        f.headers["User-Agent"] = USER_AGENT
+        f.adapter Faraday.default_adapter
+      end
+    end
+
+    # Checks every source. An error while checking one source is logged and
+    # the loop continues with the next source.
+    #
+    # Returns an Array of Result, one per source with new hits.
+    def run!
+      results = []
+
+      @sources.find_each do |source|
+        begin
+          new_hits = check_one!(source)
+          results << Result.new(source:, new_hits:) if new_hits.any?
+        rescue StandardError => e
+          @logger.warn("[TrainingWatch] source=#{source.id} #{source.name} failed: #{e.class}: #{e.message}")
+        ensure
+          mark_checked(source)
+        end
+      end
+
+      results
+    end
+
+    private
+
+    # Stores the time of this check. Called from an ensure block, where an
+    # exception would skip the rescue in run! and abort the remaining
+    # sources, so a failed update is logged instead of raised.
+    def mark_checked(source)
+      source.update!(last_checked_at: Time.current)
+    rescue StandardError => e
+      @logger.warn("[TrainingWatch] source=#{source.id} last_checked_at not saved: #{e.class}: #{e.message}")
+    end
+
+    # Fetches one source and dispatches on its kind.
+    #
+    # Returns the Array of newly created hits (empty on 304 or unknown kind).
+    def check_one!(source)
+      response = fetch(source)
+      return [] if response.nil?
+
+      hits =
+        case source.kind
+        when "html" then check_html(source, response)
+        when "rss" then check_rss(source, response)
+        else []
+        end
+
+      # Saved only after the body was processed without an error. Saving the
+      # validators earlier would make the next request answer 304 and the
+      # unprocessed content would never be looked at again.
+      remember_validators(source, response)
+      hits
+    end
+
+    # Sends a conditional GET using the validators stored on the source.
+    #
+    # Returns the response, or nil on 304. Raises RuntimeError for a status
+    # of 400 or above.
+    def fetch(source)
+      headers = {}
+      headers["If-None-Match"] = source.last_etag if source.last_etag.present?
+      headers["If-Modified-Since"] = source.last_modified if source.last_modified.present?
+
+      resp = @http.get(source.url, nil, headers)
+
+      if resp.status == 304
+        @logger.info("[TrainingWatch] source=#{source.id} not modified")
+        return nil
+      end
+
+      raise "HTTP #{resp.status}" if resp.status >= 400
+
+      resp
+    end
+
+    # Stores the ETag and Last-Modified response headers on the source for
+    # the next conditional GET.
+    def remember_validators(source, response)
+      source.update!(
+        last_etag: response.headers["etag"],
+        last_modified: response.headers["last-modified"]
+      )
+    end
+
+    # Matches the page text against the regex. On a match, creates one hit
+    # per link whose label or URL contains "2027" (at most 10), or a single
+    # hit for the page itself when no such link exists.
+    #
+    # Returns the Array of newly created hits; known hits are left out.
+    def check_html(source, response)
+      doc = Nokogiri::HTML(response.body.to_s)
+
+      page_title = doc.at_css("title")&.text&.strip
+      text = doc.text.gsub(/\s+/, " ").strip
+
+      regex = compile_regex(source)
+      return [] unless text.match?(regex)
+
+      # Collect every link with a non-empty href as a candidate.
+      candidates = doc.css("a").filter_map do |a|
+        href = a["href"].to_s.strip
+        next if href.empty?
+
+        label = a.text.to_s.gsub(/\s+/, " ").strip
+        { label:, url: absolutize_url(source.url, href) }
+      end
+
+      # Links that mention 2027 take priority over a hit for the whole page.
+      link_hits = candidates.select { |c| "#{c[:label]} #{c[:url]}".match?(/\b2027\b/i) }.first(10)
+
+      # The snippet depends only on the page text, so build it once.
+      snippet = build_snippet(text, regex)
+
+      if link_hits.any?
+        link_hits.filter_map do |c|
+          create_hit!(source:, title: c[:label].presence || page_title, hit_url: c[:url], snippet:)
+        end
+      else
+        [create_hit!(source:, title: page_title || source.name, hit_url: source.url, snippet:)].compact
+      end
+    end
+
+    # Matches the raw feed body as plain text and creates at most one hit
+    # that points at the feed URL.
+    def check_rss(source, response)
+      body = response.body.to_s
+      regex = compile_regex(source)
+      return [] unless body.match?(regex)
+
+      hit = create_hit!(
+        source:,
+        title: source.name,
+        hit_url: source.url,
+        snippet: build_snippet(body.gsub(/\s+/, " "), regex)
+      )
+      [hit].compact
+    end
+
+    # Returns the source's own regex (case-insensitive, multiline), or
+    # DEFAULT_REGEX when none is set or the stored pattern does not compile.
+    def compile_regex(source)
+      return DEFAULT_REGEX if source.match_regex.blank?
+
+      Regexp.new(source.match_regex, Regexp::IGNORECASE | Regexp::MULTILINE)
+    rescue RegexpError => e
+      # Log the fallback so that a broken pattern does not go unnoticed.
+      @logger.warn("[TrainingWatch] source=#{source.id} invalid match_regex, using default: #{e.message}")
+      DEFAULT_REGEX
+    end
+
+    # Returns up to 380 characters of text starting 120 characters before
+    # the first match, or the first 280 characters when nothing matches.
+    def build_snippet(text, regex)
+      m = text.match(regex)
+      return text[0, 280] if m.nil?
+
+      start = [m.begin(0) - 120, 0].max
+      slice = text[start, 380] || text
+      slice.strip
+    end
+
+    # Resolves href against base. Returns href unchanged when it cannot be
+    # parsed or joined.
+    def absolutize_url(base, href)
+      uri = URI.parse(href)
+      return uri.to_s if uri.absolute?
+
+      URI.join(base, href).to_s
+    rescue StandardError
+      href
+    end
+
+    # Creates a hit unless one with the same fingerprint already exists.
+    # The fingerprint covers the source id, title, URL and the first 200
+    # characters of the snippet.
+    #
+    # Returns the new TrainingWatchHit, or nil for a known fingerprint.
+    def create_hit!(source:, title:, hit_url:, snippet:, published_at: nil)
+      fp = Digest::SHA256.hexdigest([source.id, title.to_s, hit_url.to_s, snippet.to_s[0, 200]].join("|"))
+
+      # TrainingWatchHit validates fingerprint uniqueness, so create! raises
+      # RecordInvalid (not RecordNotUnique) for a known fingerprint. Skip known
+      # hits up front so one duplicate does not abort the source's whole check.
+      return nil if TrainingWatchHit.exists?(fingerprint: fp)
+
+      TrainingWatchHit.create!(
+        training_watch_source: source,
+        title: title.to_s.strip.presence,
+        hit_url: hit_url.to_s.strip.presence,
+        snippet: snippet.to_s.strip.presence,
+        published_at:,
+        fingerprint: fp
+      )
+    rescue ActiveRecord::RecordNotUnique
+      # A concurrent insert can still get in between the check and the insert.
+      nil
+    end
+  end
+end
diff --git a/app/views/layouts/application.html.erb b/app/views/layouts/application.html.erb index 700d510..6386cb5 100644 --- a/app/views/layouts/application.html.erb +++ b/app/views/layouts/application.html.erb @@ -17,7 +17,7 @@ - +