# praktikum/app/services/training_watch/checker.rb


								# app/services/training_watch/checker.rb

								require "digest"

								require "faraday"

								require "nokogiri"


								# Namespace for the training-watch service objects.
								module TrainingWatch
									# Fetches every configured source, looks for text matching the source's
									# regex (or DEFAULT_REGEX) and records each finding as a TrainingWatchHit.
									#
									# Usage: TrainingWatch::Checker.new.run! returns an Array of Result structs,
									# one per source that produced at least one newly created hit.
									class Checker
										# Default pattern: the year 2027 followed (anywhere later in the text)
										# by one of the course-related keywords. Case-insensitive, dot matches
										# newlines, whitespace inside the literal is ignored (x flag).
										DEFAULT_REGEX = /
											\b2027\b
											.*?
											(start|beginn|lehrgang|curriculum|gruppe|kurs|aufnahme|bewerb|anmeld|termin)
										/imx

										# Pattern used to pick out links that mention the year explicitly.
										LINK_YEAR_REGEX = /\b2027\b/

										# Upper bound of link hits recorded per page.
										MAX_LINK_HITS = 10

										USER_AGENT = "RailsTrainingWatch/1.0 (+https://your-app.example)".freeze

										# source:   the checked source record
										# new_hits: the hit records created during this run
										Result = Struct.new(:source, :new_hits, keyword_init: true)

										# @param sources collection responding to find_each (defaults to all enabled sources)
										# @param logger  object responding to info/warn
										def initialize(sources: TrainingWatchSource.where(enabled: true), logger: Rails.logger)
											@sources = sources
											@logger = logger
											@http = Faraday.new do |f|
												f.options.timeout = 15
												f.options.open_timeout = 10
												f.headers["User-Agent"] = USER_AGENT
												f.adapter Faraday.default_adapter
											end
										end

										# Checks all sources. A failure of one source is logged and does not
										# stop the remaining sources from being checked.
										#
										# @return [Array<Result>] one entry per source with new hits
										def run!
											results = []

											@sources.find_each do |source|
												new_hits = check_one!(source)
												results << Result.new(source:, new_hits:) if new_hits.any?
											rescue StandardError => e
												@logger.warn("[TrainingWatch] source=#{source.id} #{source.name} failed: #{e.class}: #{e.message}")
											ensure
												mark_checked(source)
											end

											results
										end

										private

										# Records the check time. Errors are logged instead of raised so that a
										# source which cannot be saved does not abort the whole run from inside
										# the ensure clause of run!.
										def mark_checked(source)
											source.update!(last_checked_at: Time.current)
										rescue StandardError => e
											@logger.warn("[TrainingWatch] source=#{source.id} could not record last_checked_at: #{e.class}: #{e.message}")
										end

										# Fetches and evaluates a single source.
										#
										# @return [Array] newly created hits (empty when unchanged or nothing matched)
										# @raise  whatever fetching, parsing or persisting raises
										def check_one!(source)
											response = fetch(source)
											return [] if response.nil?

											hits =
												case source.kind
												when "html" then check_html(source, response)
												when "rss" then check_rss(source, response)
												else
													@logger.warn("[TrainingWatch] source=#{source.id} unknown kind=#{source.kind.inspect}, skipped")
													[]
												end

											# Only remember the cache validators once the body was processed
											# without error; otherwise the next run would receive a 304 and the
											# content that failed to process would never be looked at again.
											store_validators(source, response)
											hits
										end

										# Performs a conditional GET for the source.
										#
										# @return the response, or nil when the server answered 304 Not Modified
										# @raise [RuntimeError] "HTTP <status>" for every other non-2xx status;
										#   this includes redirects, which this connection does not follow
										def fetch(source)
											headers = {}
											headers["If-None-Match"] = source.last_etag if source.last_etag.present?
											headers["If-Modified-Since"] = source.last_modified if source.last_modified.present?

											resp = @http.get(source.url, nil, headers)

											if resp.status == 304
												@logger.info("[TrainingWatch] source=#{source.id} not modified")
												return nil
											end

											raise "HTTP #{resp.status}" unless (200..299).cover?(resp.status)

											resp
										end

										# Persists the ETag / Last-Modified values used for the next conditional GET.
										def store_validators(source, response)
											source.update!(
												last_etag: response.headers["etag"],
												last_modified: response.headers["last-modified"]
											)
										end

										# Evaluates an HTML page. When the page text matches, one hit per link
										# mentioning the year is created; without such links a single hit for
										# the page itself is created.
										#
										# @return [Array] created hits (duplicates rejected by create_hit! are dropped)
										def check_html(source, response)
											doc = Nokogiri::HTML(response.body.to_s)
											page_title = doc.at_css("title")&.text&.strip
											text = doc.text.gsub(/\s+/, " ").strip

											regex = compile_regex(source)
											return [] unless text.match?(regex)

											# The snippet depends only on the page text, so build it once.
											snippet = build_snippet(text, regex)
											link_hits = year_links(doc, source.url)

											if link_hits.empty?
												page_hit = create_hit!(source:, title: page_title || source.name, hit_url: source.url, snippet:)
												return [page_hit].compact
											end

											link_hits.filter_map do |link|
												create_hit!(source:, title: link[:label].presence || page_title, hit_url: link[:url], snippet:)
											end
										end

										# Collects the page's links whose label or URL mentions the year.
										# Identical label/URL pairs are collapsed before the limit is applied so
										# repeated links do not use up the MAX_LINK_HITS slots.
										#
										# NOTE(review): the filter is fixed to 2027 even when the source has a
										# custom match_regex - confirm this is intended.
										#
										# @return [Array<Hash>] hashes with :label and :url
										def year_links(doc, base_url)
											links = doc.css("a").filter_map do |anchor|
												href = anchor["href"].to_s.strip
												next if href.empty?

												{ label: anchor.text.to_s.gsub(/\s+/, " ").strip, url: absolutize_url(base_url, href) }
											end

											links.uniq.select { |link| "#{link[:label]} #{link[:url]}".match?(LINK_YEAR_REGEX) }.first(MAX_LINK_HITS)
										end

										# Minimal RSS support: the feed is matched as plain text and at most one
										# hit pointing at the feed URL is created.
										#
										# @return [Array] the created hit, or empty when nothing matched or it already existed
										def check_rss(source, response)
											body = utf8_text(response.body)
											regex = compile_regex(source)
											return [] unless body.match?(regex)

											hit = create_hit!(
												source:,
												title: source.name,
												hit_url: source.url,
												snippet: build_snippet(body.gsub(/\s+/, " "), regex)
											)
											[hit].compact
										end

										# Returns a valid UTF-8 copy of a raw response body. A body tagged as
										# binary is reinterpreted as UTF-8 (presumably what the HTTP adapter
										# hands back - verify for the adapter in use) and invalid byte sequences
										# are replaced, so matching against a non-ASCII regex cannot raise an
										# encoding error.
										def utf8_text(raw)
											text = raw.to_s.dup
											text.force_encoding(Encoding::UTF_8) if text.encoding == Encoding::BINARY
											text.scrub
										end

										# Builds the regex for a source: its own match_regex (case-insensitive,
										# multiline) or DEFAULT_REGEX when none is set or it does not compile.
										def compile_regex(source)
											return DEFAULT_REGEX if source.match_regex.blank?

											Regexp.new(source.match_regex, Regexp::IGNORECASE | Regexp::MULTILINE)
										rescue RegexpError => e
											@logger.warn("[TrainingWatch] source=#{source.id} invalid match_regex (#{e.message}), using default")
											DEFAULT_REGEX
										end

										# Cuts an excerpt of up to 380 characters out of text, starting up to
										# 120 characters before the first match. Without a match the first 280
										# characters are returned.
										def build_snippet(text, regex)
											m = text.match(regex)
											return text[0, 280] if m.nil?

											start = [m.begin(0) - 120, 0].max
											slice = text[start, 380] || text
											slice.strip
										end

										# Resolves href against base. Absolute URLs are returned as parsed;
										# anything that cannot be parsed or joined is returned unchanged
										# (deliberately best-effort).
										def absolutize_url(base, href)
											uri = URI.parse(href)
											return uri.to_s if uri.absolute?

											URI.join(base, href).to_s
										rescue StandardError
											href
										end

										# Creates a hit record. The fingerprint is a SHA-256 over source id,
										# title, URL and the first 200 characters of the snippet.
										#
										# @return the created record, or nil when the database rejects it as a
										#   duplicate (ActiveRecord::RecordNotUnique - presumably via a unique
										#   index on fingerprint; verify against the schema)
										def create_hit!(source:, title:, hit_url:, snippet:, published_at: nil)
											fp = Digest::SHA256.hexdigest([source.id, title.to_s, hit_url.to_s, snippet.to_s[0, 200]].join("|"))

											TrainingWatchHit.create!(
												training_watch_source: source,
												title: title.to_s.strip.presence,
												hit_url: hit_url.to_s.strip.presence,
												snippet: snippet.to_s.strip.presence,
												published_at:,
												fingerprint: fp
											)
										rescue ActiveRecord::RecordNotUnique
											nil
										end
									end
								end