You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
183 lines
4.5 KiB
183 lines
4.5 KiB
# app/services/training_watch/checker.rb
|
|
require "digest"
|
|
require "faraday"
|
|
require "nokogiri"
|
|
|
|
module TrainingWatch
|
|
class Checker
|
|
# Fallback pattern used when a source has no usable custom regex: the
# standalone token "2027" followed, anywhere later in the text, by one of the
# listed course/registration keywords.
# Flags: i = case-insensitive, m = "." also matches newlines,
# x = whitespace inside the literal is ignored.
DEFAULT_REGEX = /
  \b2027\b
  .*?
  (start|beginn|lehrgang|curriculum|gruppe|kurs|aufnahme|bewerb|anmeld|termin)
/imx

# User-Agent header value for outgoing requests. Frozen so the shared
# constant cannot be mutated in place.
USER_AGENT = "RailsTrainingWatch/1.0 (+https://your-app.example)".freeze

# Pairs a source with the hits that were newly created for it.
Result = Struct.new(:source, :new_hits, keyword_init: true)
|
|
|
|
# Builds a checker.
#
# @param sources collection of sources to check; defaults to every
#   TrainingWatchSource with enabled: true
# @param logger  receives progress and failure messages; defaults to Rails.logger
def initialize(sources: TrainingWatchSource.where(enabled: true), logger: Rails.logger)
  @sources = sources
  @logger = logger

  # One shared Faraday connection for all requests
  # (request options: timeout = 15, open_timeout = 10).
  @http = Faraday.new do |conn|
    conn.headers["User-Agent"] = USER_AGENT
    conn.options.open_timeout = 10
    conn.options.timeout = 15
    conn.adapter Faraday.default_adapter
  end
end
|
|
|
|
# Checks every source and collects the newly created hits.
#
# A failure while checking one source is logged and does not stop the run.
# `last_checked_at` is stamped for every source, whether or not its check
# succeeded; if that bookkeeping write itself fails, the failure is logged
# instead of aborting the remaining sources.
#
# @return [Array<Result>] one Result per source that produced at least one new hit
def run!
  results = []

  @sources.find_each do |source|
    new_hits = check_one!(source)
    results << Result.new(source:, new_hits:) if new_hits.any?
  rescue => e
    @logger.warn("[TrainingWatch] source=#{source.id} #{source.name} failed: #{e.class}: #{e.message}")
  ensure
    begin
      source.update!(last_checked_at: Time.current)
    rescue => err
      # An exception escaping from `ensure` would terminate the whole loop.
      @logger.warn("[TrainingWatch] source=#{source.id} could not record last_checked_at: #{err.class}: #{err.message}")
    end
  end

  results
end
|
|
|
|
private
|
|
|
|
# Fetches one source and hands the response to the parser for its kind.
#
# The response's ETag / Last-Modified validators are stored only after the
# content was processed without raising. Storing them earlier would make the
# next run receive "304 Not Modified" and skip a page whose hits were never
# recorded.
#
# @param source the source record to check
# @return [Array] newly created hits; empty when the page is unchanged or
#   the source kind is neither "html" nor "rss"
def check_one!(source)
  response = fetch(source)
  return [] if response.nil?

  hits =
    case source.kind
    when "html" then check_html(source, response)
    when "rss" then check_rss(source, response) # optional; minimal implementation further down
    else []
    end

  source.update!(
    last_etag: response.headers["etag"],
    last_modified: response.headers["last-modified"]
  )

  hits
end

# Performs a conditional GET for the source's URL, sending the stored
# validators (if any) as If-None-Match / If-Modified-Since.
#
# @param source the source record to fetch
# @return the HTTP response, or nil when the server answered 304 Not Modified
# @raise [RuntimeError] "HTTP <status>" for any other status outside 200..299
def fetch(source)
  headers = {}
  headers["If-None-Match"] = source.last_etag if source.last_etag.present?
  headers["If-Modified-Since"] = source.last_modified if source.last_modified.present?

  resp = @http.get(source.url, nil, headers)

  if resp.status == 304
    @logger.info("[TrainingWatch] source=#{source.id} not modified")
    return nil
  end

  # Only 2xx responses carry the page itself. The connection is built without
  # a redirect-following middleware, so a 301/302 body must not be parsed as
  # content; treat it like any other failed request.
  raise "HTTP #{resp.status}" unless (200..299).cover?(resp.status)

  resp
end
|
|
|
|
# Scans an HTML page for the source's pattern and records hits.
#
# When the whitespace-normalized page text matches, links whose label or URL
# contain the standalone token "2027" are recorded (at most 10 distinct
# label/URL pairs). When no such link exists, a single hit for the page itself
# is recorded instead.
#
# @param source   the source record being checked
# @param response HTTP response whose body is the page HTML
# @return [Array] hits that were newly created; hits that create_hit! reports
#   as duplicates (nil) are dropped
def check_html(source, response)
  doc = Nokogiri::HTML(response.body.to_s)

  page_title = doc.at_css("title")&.text&.strip
  text = doc.text.gsub(/\s+/, " ").strip

  regex = compile_regex(source)
  return [] unless text.match?(regex)

  # The snippet depends only on the page text, so build it once for all hits.
  snippet = build_snippet(text, regex)

  # Collect every link with a non-empty href. Identical label/URL pairs are
  # collapsed so repeated links do not use up the limit applied below.
  candidates = doc.css("a").filter_map do |a|
    href = a["href"].to_s.strip
    next if href.empty?

    { label: a.text.to_s.gsub(/\s+/, " ").strip, url: absolutize_url(source.url, href) }
  end.uniq

  # Prefer links that explicitly mention 2027; otherwise fall back to a page-level hit.
  link_hits = candidates.select { |c| "#{c[:label]} #{c[:url]}".match?(/\b2027\b/i) }.first(10)

  hits =
    if link_hits.any?
      link_hits.map do |c|
        create_hit!(source:, title: c[:label].presence || page_title, hit_url: c[:url], snippet:)
      end
    else
      [create_hit!(source:, title: page_title || source.name, hit_url: source.url, snippet:)]
    end

  hits.compact
end
|
|
|
|
# Optional: RSS support (for when feed URLs are used later).
#
# Minimal approach: the raw feed body is matched as plain text (robust enough
# for "2027") and at most one hit, pointing at the feed URL, is recorded.
#
# @param source   the source record being checked
# @param response HTTP response whose body is the feed
# @return [Array] the newly created hit wrapped in an array, or an empty array
def check_rss(source, response)
  feed = response.body.to_s
  pattern = compile_regex(source)
  return [] unless feed.match?(pattern)

  hit = create_hit!(
    source:,
    title: source.name,
    hit_url: source.url,
    snippet: build_snippet(feed.gsub(/\s+/, " "), pattern)
  )
  return [] unless hit

  [hit]
end
|
|
|
|
# Returns the regex used to match a source's content.
#
# @param source record whose `match_regex` holds an optional custom pattern string
# @return [Regexp] the custom pattern compiled case-insensitively with
#   multiline matching, or DEFAULT_REGEX when none is set or it does not compile
def compile_regex(source)
  return DEFAULT_REGEX if source.match_regex.blank?

  Regexp.new(source.match_regex, Regexp::IGNORECASE | Regexp::MULTILINE)
rescue RegexpError => e
  # Fall back instead of failing the source, but leave a trace: otherwise a
  # typo in the configured pattern silently switches the source to the default.
  @logger.warn("[TrainingWatch] source=#{source.id} invalid match_regex, using default: #{e.message}")
  DEFAULT_REGEX
end
|
|
|
|
# Cuts an excerpt out of `text` around the first match of `regex`.
#
# @param text  [String] text to excerpt
# @param regex [Regexp] pattern locating the excerpt
# @return [String] up to 380 characters beginning at most 120 characters
#   before the match, with surrounding whitespace stripped; when nothing
#   matches, the first 280 characters of `text` as they are
def build_snippet(text, regex)
  hit = text.match(regex)
  return text[0, 280] unless hit

  offset = hit.begin(0) - 120
  offset = 0 if offset.negative?

  excerpt = text[offset, 380] || text
  excerpt.strip
end
|
|
|
|
# Resolves a link target against the page URL (best effort).
#
# @param base [String] URL of the page the link was found on
# @param href [String] link target as written in the page
# @return [String] the absolute URL; `href` unchanged when it cannot be
#   parsed or resolved
def absolutize_url(base, href)
  uri = URI.parse(href)
  return uri.to_s if uri.absolute?

  URI.join(base, href).to_s
rescue URI::InvalidURIError
  # URI.parse rejects raw non-ASCII characters and spaces (e.g. "kurse/über uns").
  # Percent-encode those bytes and try once more, so such links still resolve
  # to an absolute URL instead of being returned as a bare relative path.
  begin
    escaped = href.to_s.gsub(/[^\x21-\x7E]/) { |ch| ch.bytes.map { |b| format("%%%02X", b) }.join }
    escaped == href.to_s ? href : URI.join(base, escaped).to_s
  rescue StandardError
    href
  end
rescue StandardError
  href
end
|
|
|
|
# Persists one hit for a source.
#
# The fingerprint is the SHA-256 hex digest of source id, title, hit URL and
# the first 200 characters of the snippet, joined with "|". Title, URL and
# snippet are stored stripped, with blank values stored as nil.
#
# @return the created TrainingWatchHit, or nil when the insert raised
#   ActiveRecord::RecordNotUnique (presumably a unique index on fingerprint —
#   confirm against the schema)
def create_hit!(source:, title:, hit_url:, snippet:, published_at: nil)
  fingerprint_input = [source.id, title.to_s, hit_url.to_s, snippet.to_s[0, 200]].join("|")

  attributes = {
    training_watch_source: source,
    title: title.to_s.strip.presence,
    hit_url: hit_url.to_s.strip.presence,
    snippet: snippet.to_s.strip.presence,
    published_at:,
    fingerprint: Digest::SHA256.hexdigest(fingerprint_input)
  }

  TrainingWatchHit.create!(attributes)
rescue ActiveRecord::RecordNotUnique
  nil
end
|
|
end
|
|
end
|