14 changed files with 336 additions and 3 deletions
@@ -0,0 +1,14 @@
# app/jobs/training_watch_job.rb
class TrainingWatchJob < ApplicationJob
  queue_as :default

  def perform
    results = TrainingWatch::Checker.new.run!
    return if results.empty?

    email = ENV.fetch("TRAINING_WATCH_NOTIFY_EMAIL", "christoph@marzell.net")
    return if email.blank?

    TrainingWatchMailer.notify(email: email, results: results).deliver_now
  end
end
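Note (not part of this diff): nothing in the changeset schedules this job yet. A minimal sketch, assuming the `whenever` gem is used for cron scheduling; any scheduler that can enqueue `TrainingWatchJob.perform_later` once a day works just as well.

# config/schedule.rb (sketch, assumes the `whenever` gem)
every 1.day, at: "7:00" do
  runner "TrainingWatchJob.perform_later"
end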
@@ -0,0 +1,7 @@
# app/mailers/training_watch_mailer.rb
class TrainingWatchMailer < ApplicationMailer
  def notify(email:, results:)
    @results = results
    mail(to: email, subject: "Lehrgänge 2027: neue Treffer gefunden")
  end
end
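Note (not part of this diff): to eyeball the text template without sending mail, a standard ActionMailer preview could be added. The path and all sample data below are illustrative assumptions, not from this changeset.

# test/mailers/previews/training_watch_mailer_preview.rb (sketch)
class TrainingWatchMailerPreview < ActionMailer::Preview
  def notify
    # Build throwaway records so the preview works on an empty database
    source = TrainingWatchSource.first ||
             TrainingWatchSource.new(name: "Beispielquelle", url: "https://example.org", kind: "html")
    hit = TrainingWatchHit.new(title: "Lehrgang 2027", hit_url: "https://example.org/2027", snippet: "Start 2027 ...")
    result = TrainingWatch::Checker::Result.new(source: source, new_hits: [hit])
    TrainingWatchMailer.notify(email: "preview@example.org", results: [result])
  end
end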
@@ -0,0 +1,7 @@
# frozen_string_literal: true
# app/models/training_watch_hit.rb
class TrainingWatchHit < ApplicationRecord
  belongs_to :training_watch_source

  validates :fingerprint, presence: true, uniqueness: true
end
@@ -0,0 +1,10 @@
# frozen_string_literal: true

# app/models/training_watch_source.rb
class TrainingWatchSource < ApplicationRecord
  has_many :training_watch_hits, dependent: :delete_all

  validates :name, :url, :kind, presence: true
  validates :url, uniqueness: true
  validates :kind, inclusion: { in: %w[html rss] }
end
@@ -0,0 +1,183 @@
# app/services/training_watch/checker.rb
require "digest"
require "faraday"
require "nokogiri"

module TrainingWatch
  class Checker
    DEFAULT_REGEX = /
      \b2027\b
      .*?
      (start|beginn|lehrgang|curriculum|gruppe|kurs|aufnahme|bewerb|anmeld|termin)
    /imx

    USER_AGENT = "RailsTrainingWatch/1.0 (+https://your-app.example)"

    Result = Struct.new(:source, :new_hits, keyword_init: true)

    def initialize(sources: TrainingWatchSource.where(enabled: true), logger: Rails.logger)
      @sources = sources
      @logger = logger
      @http = Faraday.new do |f|
        f.options.timeout = 15
        f.options.open_timeout = 10
        f.headers["User-Agent"] = USER_AGENT
        f.adapter Faraday.default_adapter
      end
    end

    def run!
      results = []

      @sources.find_each do |source|
        begin
          new_hits = check_one!(source)
          results << Result.new(source:, new_hits:) if new_hits.any?
        rescue => e
          @logger.warn("[TrainingWatch] source=#{source.id} #{source.name} failed: #{e.class}: #{e.message}")
        ensure
          source.update!(last_checked_at: Time.current)
        end
      end

      results
    end

    private

    def check_one!(source)
      response = fetch(source)
      return [] if response.nil?

      case source.kind
      when "html" then check_html(source, response)
      when "rss" then check_rss(source, response) # optional, stub below
      else []
      end
    end

    def fetch(source)
      headers = {}
      headers["If-None-Match"] = source.last_etag if source.last_etag.present?
      headers["If-Modified-Since"] = source.last_modified if source.last_modified.present?

      resp = @http.get(source.url, nil, headers)

      if resp.status == 304
        @logger.info("[TrainingWatch] source=#{source.id} not modified")
        return nil
      end

      if resp.status >= 400
        raise "HTTP #{resp.status}"
      end

      source.update!(
        last_etag: resp.headers["etag"],
        last_modified: resp.headers["last-modified"]
      )

      resp
    end

    def check_html(source, response)
      html = response.body.to_s
      doc = Nokogiri::HTML(html)

      page_title = doc.at_css("title")&.text&.strip
      text = doc.text.gsub(/\s+/, " ").strip

      regex = compile_regex(source)
      return [] unless text.match?(regex)

      # Collect links that might also match (or at least contain "2027")
      candidates = doc.css("a").map do |a|
        href = a["href"].to_s.strip
        next if href.empty?

        label = a.text.to_s.gsub(/\s+/, " ").strip
        abs = absolutize_url(source.url, href)
        { label:, url: abs }
      end.compact

      # If links explicitly contain 2027, prioritize them; otherwise record a page-level hit
      link_hits = candidates.select { |c| (c[:label] + " " + c[:url]).match?(/\b2027\b/i) }.first(10)

      hits = []
      if link_hits.any?
        link_hits.each do |c|
          hits << create_hit!(
            source:,
            title: c[:label].presence || page_title,
            hit_url: c[:url],
            snippet: build_snippet(text, regex)
          )
        end
      else
        hits << create_hit!(
          source:,
          title: page_title || source.name,
          hit_url: source.url,
          snippet: build_snippet(text, regex)
        )
      end

      hits.compact
    end

    # Optional: RSS support (if you later use feed URLs)
    def check_rss(source, response)
      # Minimal: match the raw RSS body as text (robust enough for "2027")
      body = response.body.to_s
      regex = compile_regex(source)
      return [] unless body.match?(regex)

      create = create_hit!(
        source:,
        title: source.name,
        hit_url: source.url,
        snippet: build_snippet(body.gsub(/\s+/, " "), regex)
      )
      create ? [create] : []
    end

    def compile_regex(source)
      return DEFAULT_REGEX if source.match_regex.blank?
      Regexp.new(source.match_regex, Regexp::IGNORECASE | Regexp::MULTILINE)
    rescue RegexpError
      DEFAULT_REGEX
    end

    def build_snippet(text, regex)
      m = text.match(regex)
      return text[0, 280] if m.nil?

      start = [m.begin(0) - 120, 0].max
      slice = text[start, 380] || text
      slice.strip
    end

    def absolutize_url(base, href)
      uri = URI.parse(href)
      return uri.to_s if uri.absolute?
      URI.join(base, href).to_s
    rescue
      href
    end

    def create_hit!(source:, title:, hit_url:, snippet:, published_at: nil)
      fp = Digest::SHA256.hexdigest([source.id, title.to_s, hit_url.to_s, snippet.to_s[0, 200]].join("|"))

      TrainingWatchHit.create!(
        training_watch_source: source,
        title: title.to_s.strip.presence,
        hit_url: hit_url.to_s.strip.presence,
        snippet: snippet.to_s.strip.presence,
        published_at:,
        fingerprint: fp
      )
    rescue ActiveRecord::RecordNotUnique, ActiveRecord::RecordInvalid
      # Duplicate fingerprint: the DB index raises RecordNotUnique, the model-level
      # uniqueness validation raises RecordInvalid. Either way it is not a new hit.
      nil
    end
  end
end
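Note (not part of this diff): a quick way to exercise the checker from the Rails console or db/seeds.rb. The source below is an illustrative placeholder, not one from this changeset.

# Illustrative only: name and URL are placeholders
TrainingWatchSource.find_or_create_by!(url: "https://example.org/lehrgaenge") do |s|
  s.name = "Beispiel-Institut"
  s.kind = "html"
end

results = TrainingWatch::Checker.new.run!
results.each { |r| puts "#{r.source.name}: #{r.new_hits.size} new hit(s)" }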
@@ -0,0 +1,12 @@
<%# app/views/training_watch_mailer/notify.text.erb (ERB comment so the path does not leak into the plain-text email) %>
Neue Treffer für Lehrgänge Start 2027:

<% @results.each do |r| %>
Quelle: <%= r.source.name %> (<%= r.source.url %>)
<% r.new_hits.each do |h| %>
- <%= h.title.presence || "Treffer" %>
  <%= h.hit_url %>
  <%= h.snippet %>
<% end %>

<% end %>
@@ -0,0 +1,18 @@
class CreateTrainingWatchSources < ActiveRecord::Migration[7.1]
  def change
    create_table :training_watch_sources do |t|
      t.string :name, null: false
      t.string :url, null: false
      t.string :kind, null: false, default: "html" # html | rss (optional)
      t.string :match_regex # optional: overrides the default matcher
      t.string :last_etag
      t.string :last_modified
      t.datetime :last_checked_at
      t.boolean :enabled, null: false, default: true
      t.timestamps
    end

    add_index :training_watch_sources, :enabled
    add_index :training_watch_sources, :url, unique: true
  end
end
@@ -0,0 +1,16 @@
class CreateTrainingWatchHits < ActiveRecord::Migration[7.1]
  def change
    create_table :training_watch_hits do |t|
      t.references :training_watch_source, null: false, foreign_key: true
      t.string :title
      t.string :hit_url
      t.datetime :published_at
      t.text :snippet
      t.string :fingerprint, null: false
      t.timestamps
    end

    add_index :training_watch_hits, :fingerprint, unique: true
    add_index :training_watch_hits, :created_at
  end
end