# app/services/training_watch/checker.rb
require "digest"
require "faraday"
require "nokogiri"
require "uri"

module TrainingWatch
  class Checker
    # Default match: "2027" followed somewhere later in the text by a German
    # training-related keyword stem ("Start", "Beginn", "Lehrgang", ...).
    DEFAULT_REGEX = /
      \b2027\b
      .*?
      (start|beginn|lehrgang|curriculum|gruppe|kurs|aufnahme|bewerb|anmeld|termin)
    /imx

    USER_AGENT = "RailsTrainingWatch/1.0"

    Result = Struct.new(:source, :new_hits, keyword_init: true)

    def initialize(sources: TrainingWatchSource.where(enabled: true), logger: Rails.logger)
      @sources = sources
      @logger = logger
      @http = Faraday.new do |f|
        f.options.timeout = 15
        f.options.open_timeout = 10
        f.headers["User-Agent"] = USER_AGENT
        f.adapter Faraday.default_adapter
      end
    end

    def run!
      results = []
      @sources.find_each do |source|
        begin
          new_hits = check_one!(source)
          results << Result.new(source: source, new_hits: new_hits) if new_hits.any?
        rescue => e
          @logger.warn("[TrainingWatch] source=#{source.id} #{source.name} failed: #{e.class}: #{e.message}")
        ensure
          source.update!(last_checked_at: Time.current)
        end
      end
      results
    end

    private

    def check_one!(source)
      response = fetch(source)
      return [] if response.nil?

      case source.kind
      when "html" then check_html(source, response)
      when "rss"  then check_rss(source, response) # optional, stub below
      else []
      end
    end

    def fetch(source)
      # Conditional GET: send the cached validators so unchanged pages come
      # back as a cheap 304 instead of a full body.
      headers = {}
      headers["If-None-Match"] = source.last_etag if source.last_etag.present?
      headers["If-Modified-Since"] = source.last_modified if source.last_modified.present?

      resp = @http.get(source.url, nil, headers)

      if resp.status == 304
        @logger.info("[TrainingWatch] source=#{source.id} not modified")
        return nil
      end

      raise "HTTP #{resp.status}" if resp.status >= 400

      source.update!(
        last_etag: resp.headers["etag"],
        last_modified: resp.headers["last-modified"]
      )
      resp
    end

    def check_html(source, response)
      html = response.body.to_s
      doc = Nokogiri::HTML(html)
      page_title = doc.at_css("title")&.text&.strip
      text = doc.text.gsub(/\s+/, " ").strip

      regex = compile_regex(source)
      return [] unless text.match?(regex)

      # Collect links that might also match (or at least contain "2027").
      candidates = doc.css("a").map do |a|
        href = a["href"].to_s.strip
        next if href.empty?

        label = a.text.to_s.gsub(/\s+/, " ").strip
        abs = absolutize_url(source.url, href)
        { label: label, url: abs }
      end.compact

      # Prefer links that explicitly mention 2027; otherwise fall back to a
      # single page-level hit.
      link_hits = candidates.select { |c| (c[:label] + " " + c[:url]).match?(/\b2027\b/i) }.first(10)

      hits = []
      if link_hits.any?
        link_hits.each do |c|
          hits << create_hit!(
            source: source,
            title: c[:label].presence || page_title,
            hit_url: c[:url],
            snippet: build_snippet(text, regex)
          )
        end
      else
        hits << create_hit!(
          source: source,
          title: page_title || source.name,
          hit_url: source.url,
          snippet: build_snippet(text, regex)
        )
      end
      hits.compact
    end

    # Optional: RSS support (if you later add feed URLs).
    def check_rss(source, response)
      # Minimal approach: match the feed as plain text, which is robust
      # enough for a literal token like "2027".
      body = response.body.to_s
      regex = compile_regex(source)
      return [] unless body.match?(regex)

      hit = create_hit!(
        source: source,
        title: source.name,
        hit_url: source.url,
        snippet: build_snippet(body.gsub(/\s+/, " "), regex)
      )
      hit ? [hit] : []
    end

    def compile_regex(source)
      return DEFAULT_REGEX if source.match_regex.blank?

      Regexp.new(source.match_regex, Regexp::IGNORECASE | Regexp::MULTILINE)
    rescue RegexpError
      DEFAULT_REGEX
    end

    def build_snippet(text, regex)
      m = text.match(regex)
      return text[0, 280] if m.nil?
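
      # Center the snippet on the first match: back up ~120 characters for
      # leading context, then take a ~380-character window from there.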
      start = [m.begin(0) - 120, 0].max
      slice = text[start, 380] || text
      slice.strip
    end

    def absolutize_url(base, href)
      uri = URI.parse(href)
      return uri.to_s if uri.absolute?

      URI.join(base, href).to_s
    rescue StandardError
      # Fall back to the raw href if it cannot be parsed or joined.
      href
    end

    def create_hit!(source:, title:, hit_url:, snippet:, published_at: nil)
      # Dedupe via a content fingerprint; the RecordNotUnique rescue below
      # relies on a unique index on `fingerprint`.
      fp = Digest::SHA256.hexdigest(
        [source.id, title.to_s, hit_url.to_s, snippet.to_s[0, 200]].join("|")
      )
      TrainingWatchHit.create!(
        training_watch_source: source,
        title: title.to_s.strip.presence,
        hit_url: hit_url.to_s.strip.presence,
        snippet: snippet.to_s.strip.presence,
        published_at: published_at,
        fingerprint: fp
      )
    rescue ActiveRecord::RecordNotUnique
      nil
    end
  end
end
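
# --- Usage sketch ---
# A minimal sketch of how the checker might be driven from a recurring job.
# The job class below is hypothetical (it would live in its own file, e.g.
# app/jobs/training_watch_check_job.rb) and is not part of this service; it
# only relies on the public `run!` API and the `Result` struct above.
#
# class TrainingWatchCheckJob < ApplicationJob
#   queue_as :default
#
#   def perform
#     TrainingWatch::Checker.new.run!.each do |result|
#       Rails.logger.info(
#         "[TrainingWatch] #{result.source.name}: #{result.new_hits.size} new hit(s)"
#       )
#     end
#   end
# end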