You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

183 lines
4.5 KiB

# app/services/training_watch/checker.rb
require "digest"
require "faraday"
require "nokogiri"
module TrainingWatch
  # Fetches every given source over HTTP, matches the response against the
  # source's regex (or DEFAULT_REGEX) and creates a TrainingWatchHit for each
  # matching page or matching link on that page.
  #
  # Conditional requests (If-None-Match / If-Modified-Since) are sent when the
  # source has stored validators, and a 304 response short-circuits the check.
  class Checker
    # Fallback pattern: the year 2027 followed, anywhere later in the text, by
    # one of the listed keywords (case-insensitive, "." also matches newlines).
    DEFAULT_REGEX = /
      \b2027\b
      .*?
      (start|beginn|lehrgang|curriculum|gruppe|kurs|aufnahme|bewerb|anmeld|termin)
    /imx

    # Links whose label or URL match this are preferred over a page-level hit.
    LINK_YEAR_REGEX = /\b2027\b/

    # Upper bound on link hits created per page.
    MAX_LINK_HITS = 10

    # Snippet window: characters kept before the match start / total length /
    # length used when the regex does not match the snippet text at all.
    SNIPPET_LEAD = 120
    SNIPPET_LENGTH = 380
    SNIPPET_FALLBACK_LENGTH = 280

    USER_AGENT = "RailsTrainingWatch/1.0 (+https://your-app.example)".freeze

    # One entry per source that produced at least one new hit.
    Result = Struct.new(:source, :new_hits, keyword_init: true)

    # @param sources [#find_each] sources to check; defaults to all enabled
    #   TrainingWatchSource records
    # @param logger [#info, #warn] receives progress and failure messages
    def initialize(sources: TrainingWatchSource.where(enabled: true), logger: Rails.logger)
      @sources = sources
      @logger = logger
      @http = Faraday.new do |f|
        f.options.timeout = 15
        f.options.open_timeout = 10
        f.headers["User-Agent"] = USER_AGENT
        f.adapter Faraday.default_adapter
      end
    end

    # Checks every source. A failure in one source is logged and does not stop
    # the remaining sources from being checked.
    #
    # @return [Array<Result>] sources with at least one newly created hit
    def run!
      results = []
      @sources.find_each do |source|
        new_hits = check_one!(source)
        results << Result.new(source:, new_hits:) if new_hits.any?
      rescue StandardError => e
        @logger.warn("[TrainingWatch] source=#{source.id} #{source.name} failed: #{e.class}: #{e.message}")
      ensure
        mark_checked(source)
      end
      results
    end

    private

    # Stamps last_checked_at. This runs from an +ensure+, so an exception here
    # would bypass the per-source rescue in #run! and abort the whole run;
    # it is therefore logged instead of raised.
    def mark_checked(source)
      source.update!(last_checked_at: Time.current)
    rescue StandardError => e
      @logger.warn("[TrainingWatch] source=#{source.id} could not record last_checked_at: #{e.class}: #{e.message}")
    end

    # Fetches and evaluates a single source.
    #
    # @return [Array] newly created hits (empty on 304 or unknown kind)
    # @raise [RuntimeError] on a non-2xx HTTP status (see #fetch)
    def check_one!(source)
      response = fetch(source)
      return [] if response.nil?

      hits =
        case source.kind
        when "html" then check_html(source, response)
        when "rss" then check_rss(source, response)
        else []
        end

      # Only remember the validators once the body has been processed without
      # an error. Storing them earlier would make the next run receive a 304
      # for content that was never successfully evaluated.
      remember_validators(source, response)
      hits
    end

    # Performs the (conditional) GET for a source.
    #
    # @return [Object, nil] the Faraday response, or nil when the server
    #   answered 304 Not Modified
    # @raise [RuntimeError] "HTTP <status>" for any other non-2xx status
    def fetch(source)
      headers = {}
      headers["If-None-Match"] = source.last_etag if source.last_etag.present?
      headers["If-Modified-Since"] = source.last_modified if source.last_modified.present?

      resp = @http.get(source.url, nil, headers)
      if resp.status == 304
        @logger.info("[TrainingWatch] source=#{source.id} not modified")
        return nil
      end

      # The connection built in #initialize has no redirect middleware, so a
      # 3xx body is not the target page; treat it like any other failure
      # instead of scanning it.
      raise "HTTP #{resp.status}" unless (200..299).cover?(resp.status)

      resp
    end

    # Persists the ETag / Last-Modified headers for the next conditional GET.
    # Best effort: hits have already been created at this point, so a failure
    # is logged rather than raised to keep them in the #run! result.
    def remember_validators(source, response)
      source.update!(
        last_etag: response.headers["etag"],
        last_modified: response.headers["last-modified"]
      )
    rescue StandardError => e
      @logger.warn("[TrainingWatch] source=#{source.id} could not store cache validators: #{e.class}: #{e.message}")
    end

    # Matches the whitespace-normalised page text against the source regex.
    # When the page matches, links that mention the year are recorded as
    # individual hits; if there are none, the page itself is recorded.
    #
    # @return [Array] newly created hits (duplicates are dropped)
    def check_html(source, response)
      doc = Nokogiri::HTML(response.body.to_s)
      page_title = doc.at_css("title")&.text&.strip
      text = doc.text.gsub(/\s+/, " ").strip
      regex = compile_regex(source)
      return [] unless text.match?(regex)

      # The snippet depends only on the page text, so build it once.
      snippet = build_snippet(text, regex)
      link_hits = prioritized_links(collect_links(doc, source.url))

      if link_hits.empty?
        page_hit = create_hit!(source:, title: page_title || source.name, hit_url: source.url, snippet:)
        return [page_hit].compact
      end

      link_hits.filter_map do |link|
        create_hit!(source:, title: link[:label].presence || page_title, hit_url: link[:url], snippet:)
      end
    end

    # Collects every anchor that has a non-empty href.
    #
    # @return [Array<Hash>] +{ label:, url: }+ with an absolute url where possible
    def collect_links(doc, base_url)
      doc.css("a").filter_map do |anchor|
        href = anchor["href"].to_s.strip
        next if href.empty?

        { label: anchor.text.to_s.gsub(/\s+/, " ").strip, url: absolutize_url(base_url, href) }
      end
    end

    # Keeps the links whose label or URL mention the year. Identical links
    # (e.g. repeated in navigation and footer) are collapsed *before* the cap
    # is applied so they cannot crowd out distinct matches.
    #
    # @return [Array<Hash>] at most MAX_LINK_HITS entries, in document order
    def prioritized_links(candidates)
      candidates
        .uniq
        .select { |link| "#{link[:label]} #{link[:url]}".match?(LINK_YEAR_REGEX) }
        .first(MAX_LINK_HITS)
    end

    # Minimal feed support: the raw feed body is matched as plain text and a
    # single hit pointing at the feed URL is recorded.
    #
    # @return [Array] the new hit, or an empty array
    def check_rss(source, response)
      body = response.body.to_s
      regex = compile_regex(source)
      return [] unless body.match?(regex)

      hit = create_hit!(
        source:,
        title: source.name,
        hit_url: source.url,
        snippet: build_snippet(body.gsub(/\s+/, " "), regex)
      )
      [hit].compact
    end

    # @return [Regexp] the source's own pattern (case-insensitive, multiline),
    #   or DEFAULT_REGEX when it is blank or does not compile
    def compile_regex(source)
      return DEFAULT_REGEX if source.match_regex.blank?

      Regexp.new(source.match_regex, Regexp::IGNORECASE | Regexp::MULTILINE)
    rescue RegexpError
      DEFAULT_REGEX
    end

    # Cuts a window of text around the first regex match.
    #
    # @return [String] up to SNIPPET_LENGTH characters starting SNIPPET_LEAD
    #   characters before the match, or the first SNIPPET_FALLBACK_LENGTH
    #   characters when the regex does not match +text+
    def build_snippet(text, regex)
      match = text.match(regex)
      return text[0, SNIPPET_FALLBACK_LENGTH] if match.nil?

      start = [match.begin(0) - SNIPPET_LEAD, 0].max
      (text[start, SNIPPET_LENGTH] || text).strip
    end

    # Resolves +href+ against +base+.
    #
    # @return [String] the absolute URL, or +href+ unchanged when either value
    #   cannot be parsed or joined
    def absolutize_url(base, href)
      uri = URI.parse(href)
      return uri.to_s if uri.absolute?

      URI.join(base, href).to_s
    rescue StandardError
      # Deliberately best effort: a malformed link must not fail the source.
      href
    end

    # Creates a hit, identified by a SHA-256 fingerprint over source id,
    # title, URL and the first 200 snippet characters.
    #
    # @return [Object, nil] the created record, or nil when the insert raises
    #   ActiveRecord::RecordNotUnique (presumably a unique index on
    #   fingerprint - confirm against the schema)
    def create_hit!(source:, title:, hit_url:, snippet:, published_at: nil)
      fp = Digest::SHA256.hexdigest([source.id, title.to_s, hit_url.to_s, snippet.to_s[0, 200]].join("|"))
      TrainingWatchHit.create!(
        training_watch_source: source,
        title: title.to_s.strip.presence,
        hit_url: hit_url.to_s.strip.presence,
        snippet: snippet.to_s.strip.presence,
        published_at:,
        fingerprint: fp
      )
    rescue ActiveRecord::RecordNotUnique
      nil
    end
  end
end