# app/services/training_watch/checker.rb
require "digest"
require "faraday"
require "nokogiri"
require "uri"

module TrainingWatch
  class Checker
    # Default match: "2027" followed somewhere later in the text by a German
    # training-related keyword stem ("Start", "Beginn", "Lehrgang", ...).
    DEFAULT_REGEX = /
      \b2027\b
      .*?
      (start|beginn|lehrgang|curriculum|gruppe|kurs|aufnahme|bewerb|anmeld|termin)
    /imx

    USER_AGENT = "RailsTrainingWatch/1.0"

    Result = Struct.new(:source, :new_hits, keyword_init: true)

    def initialize(sources: TrainingWatchSource.where(enabled: true), logger: Rails.logger)
      @sources = sources
      @logger = logger
      @http = Faraday.new do |f|
        f.options.timeout = 15
        f.options.open_timeout = 10
        f.headers["User-Agent"] = USER_AGENT
        f.adapter Faraday.default_adapter
      end
    end

    def run!
      results = []
      @sources.find_each do |source|
        begin
          new_hits = check_one!(source)
          results << Result.new(source: source, new_hits: new_hits) if new_hits.any?
        rescue => e
          @logger.warn("[TrainingWatch] source=#{source.id} #{source.name} failed: #{e.class}: #{e.message}")
        ensure
          source.update!(last_checked_at: Time.current)
        end
      end
      results
    end

    private

    def check_one!(source)
      response = fetch(source)
      return [] if response.nil?

      case source.kind
      when "html" then check_html(source, response)
      when "rss"  then check_rss(source, response) # optional, stub below
      else []
      end
    end

    def fetch(source)
      # Conditional GET: send the cached validators so unchanged pages come
      # back as a cheap 304 instead of a full body.
      headers = {}
      headers["If-None-Match"] = source.last_etag if source.last_etag.present?
      headers["If-Modified-Since"] = source.last_modified if source.last_modified.present?

      resp = @http.get(source.url, nil, headers)

      if resp.status == 304
        @logger.info("[TrainingWatch] source=#{source.id} not modified")
        return nil
      end

      raise "HTTP #{resp.status}" if resp.status >= 400

      source.update!(
        last_etag: resp.headers["etag"],
        last_modified: resp.headers["last-modified"]
      )
      resp
    end

    def check_html(source, response)
      html = response.body.to_s
      doc = Nokogiri::HTML(html)
      page_title = doc.at_css("title")&.text&.strip
      text = doc.text.gsub(/\s+/, " ").strip

      regex = compile_regex(source)
      return [] unless text.match?(regex)

      # Collect links that might also match (or at least contain "2027").
      candidates = doc.css("a").map do |a|
        href = a["href"].to_s.strip
        next if href.empty?

        label = a.text.to_s.gsub(/\s+/, " ").strip
        abs = absolutize_url(source.url, href)
        { label: label, url: abs }
      end.compact

      # Prefer links that explicitly mention 2027; otherwise fall back to a
      # single page-level hit.
      link_hits = candidates.select { |c| (c[:label] + " " + c[:url]).match?(/\b2027\b/i) }.first(10)

      hits = []
      if link_hits.any?
        link_hits.each do |c|
          hits << create_hit!(
            source: source,
            title: c[:label].presence || page_title,
            hit_url: c[:url],
            snippet: build_snippet(text, regex)
          )
        end
      else
        hits << create_hit!(
          source: source,
          title: page_title || source.name,
          hit_url: source.url,
          snippet: build_snippet(text, regex)
        )
      end
      hits.compact
    end

    # Optional: RSS support (if you later add feed URLs).
    def check_rss(source, response)
      # Minimal approach: match the feed as plain text, which is robust
      # enough for a literal token like "2027".
      body = response.body.to_s
      regex = compile_regex(source)
      return [] unless body.match?(regex)

      hit = create_hit!(
        source: source,
        title: source.name,
        hit_url: source.url,
        snippet: build_snippet(body.gsub(/\s+/, " "), regex)
      )
      hit ? [hit] : []
    end

    def compile_regex(source)
      return DEFAULT_REGEX if source.match_regex.blank?

      Regexp.new(source.match_regex, Regexp::IGNORECASE | Regexp::MULTILINE)
    rescue RegexpError
      DEFAULT_REGEX
    end

    def build_snippet(text, regex)
      m = text.match(regex)
      return text[0, 280] if m.nil?
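
      # Center the snippet on the first match: back up ~120 characters for
      # leading context, then take a ~380-character window from there.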
      start = [m.begin(0) - 120, 0].max
      slice = text[start, 380] || text
      slice.strip
    end

    def absolutize_url(base, href)
      uri = URI.parse(href)
      return uri.to_s if uri.absolute?

      URI.join(base, href).to_s
    rescue StandardError
      # Fall back to the raw href if it cannot be parsed or joined.
      href
    end

    def create_hit!(source:, title:, hit_url:, snippet:, published_at: nil)
      # Dedupe via a content fingerprint; the RecordNotUnique rescue below
      # relies on a unique index on `fingerprint`.
      fp = Digest::SHA256.hexdigest(
        [source.id, title.to_s, hit_url.to_s, snippet.to_s[0, 200]].join("|")
      )
      TrainingWatchHit.create!(
        training_watch_source: source,
        title: title.to_s.strip.presence,
        hit_url: hit_url.to_s.strip.presence,
        snippet: snippet.to_s.strip.presence,
        published_at: published_at,
        fingerprint: fp
      )
    rescue ActiveRecord::RecordNotUnique
      nil
    end
  end
end
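
# --- Usage sketch ---
# A minimal sketch of how the checker might be driven from a recurring job.
# The job class below is hypothetical (it would live in its own file, e.g.
# app/jobs/training_watch_check_job.rb) and is not part of this service; it
# only relies on the public `run!` API and the `Result` struct above.
#
# class TrainingWatchCheckJob < ApplicationJob
#   queue_as :default
#
#   def perform
#     TrainingWatch::Checker.new.run!.each do |result|
#       Rails.logger.info(
#         "[TrainingWatch] #{result.source.name}: #{result.new_hits.size} new hit(s)"
#       )
#     end
#   end
# end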