Browse Source

add fetch new courses

main
Christoph Marzell 5 days ago
parent
commit
983497ad2e
  1. 2
      Gemfile
  2. 12
      Gemfile.lock
  3. 14
      app/jobs/training_watch_job.rb
  4. 7
      app/mailers/training_watch_mailer.rb
  5. 7
      app/models/training_watch_hit.rb
  6. 10
      app/models/training_watch_source.rb
  7. 183
      app/services/training_watch/checker.rb
  8. 2
      app/views/layouts/application.html.erb
  9. 12
      app/views/training_watch_mailer/notify.text.erb
  10. 6
      config/initializers/dump_scheduler.rb
  11. 18
      db/migrate/20251207054128_create_training_watch_sources.rb
  12. 16
      db/migrate/20251207054146_create_training_watch_hits.rb
  13. 32
      db/schema.rb
  14. 16
      db/seeds.rb

2
Gemfile

@ -60,3 +60,5 @@ end
gem 'devise' gem 'devise'
gem 'rufus-scheduler' gem 'rufus-scheduler'
gem 'pghero' gem 'pghero'
# HTTP client and HTML parser required by app/services/training_watch/checker.rb
gem 'faraday'
gem 'nokogiri'

12
Gemfile.lock

@ -110,6 +110,12 @@ GEM
erubi (1.13.1) erubi (1.13.1)
et-orbi (1.4.0) et-orbi (1.4.0)
tzinfo tzinfo
faraday (2.14.0)
faraday-net_http (>= 2.0, < 3.5)
json
logger
faraday-net_http (3.4.2)
net-http (~> 0.5)
ffi (1.17.2-x64-mingw32) ffi (1.17.2-x64-mingw32)
ffi (1.17.2-x86_64-linux-gnu) ffi (1.17.2-x86_64-linux-gnu)
fugit (1.12.1) fugit (1.12.1)
@ -133,6 +139,7 @@ GEM
thor (>= 0.14, < 2.0) thor (>= 0.14, < 2.0)
jquery-ui-rails (8.0.0) jquery-ui-rails (8.0.0)
railties (>= 3.2.16) railties (>= 3.2.16)
json (2.17.1)
kaminari (1.2.2) kaminari (1.2.2)
activesupport (>= 4.1.0) activesupport (>= 4.1.0)
kaminari-actionview (= 1.2.2) kaminari-actionview (= 1.2.2)
@ -160,6 +167,8 @@ GEM
minitest (5.26.1) minitest (5.26.1)
msgpack (1.8.0) msgpack (1.8.0)
mutex_m (0.3.0) mutex_m (0.3.0)
net-http (0.8.0)
uri (>= 0.11.1)
net-imap (0.4.22) net-imap (0.4.22)
date date
net-protocol net-protocol
@ -271,6 +280,7 @@ GEM
concurrent-ruby (~> 1.0) concurrent-ruby (~> 1.0)
tzinfo-data (1.2025.2) tzinfo-data (1.2025.2)
tzinfo (>= 1.0.0) tzinfo (>= 1.0.0)
uri (1.1.1)
warden (1.2.9) warden (1.2.9)
rack (>= 2.0.9) rack (>= 2.0.9)
web-console (4.2.1) web-console (4.2.1)
@ -292,9 +302,11 @@ DEPENDENCIES
administrate administrate
bootsnap bootsnap
devise devise
faraday
jbuilder jbuilder
jquery-rails jquery-rails
jquery-ui-rails jquery-ui-rails
nokogiri
pg (~> 1.1) pg (~> 1.1)
pghero pghero
puma (>= 5.0) puma (>= 5.0)

14
app/jobs/training_watch_job.rb

@ -0,0 +1,14 @@
# app/jobs/training_watch_job.rb
# Background job that runs the training watch checker and mails any new hits.
class TrainingWatchJob < ApplicationJob
  queue_as :default

  # Runs the checker and sends one notification mail when there are results.
  # The recipient comes from TRAINING_WATCH_NOTIFY_EMAIL, with a hard-coded
  # fallback address when the variable is unset. Nothing is sent when there
  # are no results or the recipient is blank.
  def perform
    results = TrainingWatch::Checker.new.run!
    recipient = ENV.fetch("TRAINING_WATCH_NOTIFY_EMAIL", "christoph@marzell.net")
    return if results.empty? || recipient.blank?

    TrainingWatchMailer.notify(email: recipient, results: results).deliver_now
  end
end

7
app/mailers/training_watch_mailer.rb

@ -0,0 +1,7 @@
# app/mailers/training_watch_mailer.rb
# Mailer that reports newly found training watch hits.
class TrainingWatchMailer < ApplicationMailer
  # Builds the notification mail.
  #
  # @param email [String] recipient address
  # @param results [Array] checker results, exposed to the template as @results
  def notify(email:, results:)
    @results = results

    mail(
      to: email,
      subject: "Lehrgänge 2027: neue Treffer gefunden"
    )
  end
end

7
app/models/training_watch_hit.rb

@ -0,0 +1,7 @@
# frozen_string_literal: true
# app/models/training_watch_hit.rb
# A single stored match, identified by a unique fingerprint.
class TrainingWatchHit < ApplicationRecord
  belongs_to :training_watch_source

  # The fingerprint must be present and must not already exist.
  validates :fingerprint, presence: true
  validates :fingerprint, uniqueness: true
end

10
app/models/training_watch_source.rb

@ -0,0 +1,10 @@
# frozen_string_literal: true
# app/models/training_watch_source.rb
# A watched URL together with the hits recorded for it.
class TrainingWatchSource < ApplicationRecord
  # Hits are removed with delete_all when the source is destroyed.
  has_many :training_watch_hits, dependent: :delete_all

  # Required attributes (validators kept in the same order as before).
  validates :name, presence: true
  validates :url, presence: true
  validates :kind, presence: true

  validates :url, uniqueness: true
  validates :kind, inclusion: { in: %w[html rss] }
end

183
app/services/training_watch/checker.rb

@ -0,0 +1,183 @@
# app/services/training_watch/checker.rb
require "digest"
require "faraday"
require "nokogiri"
module TrainingWatch
class Checker
# Fallback matcher: the standalone token "2027", followed (lazily) by any
# characters, followed by one of the listed keyword stems.
# Flags: i = case-insensitive, m = "." also matches newlines,
# x = extended syntax (whitespace inside the literal is ignored).
DEFAULT_REGEX = /
\b2027\b
.*?
(start|beginn|lehrgang|curriculum|gruppe|kurs|aufnahme|bewerb|anmeld|termin)
/imx
# Value of the User-Agent header set on the HTTP connection.
# NOTE(review): the URL looks like a placeholder — confirm and replace with the real app URL.
USER_AGENT = "RailsTrainingWatch/1.0 (+https://your-app.example)"
# Outcome for one source: the source itself and the hits newly created for it.
Result = Struct.new(:source, :new_hits, keyword_init: true)
# Sets up the checker and its HTTP connection.
#
# @param sources [#find_each] sources to check; defaults to all enabled TrainingWatchSource rows
# @param logger [#info, #warn] log sink; defaults to Rails.logger
def initialize(sources: TrainingWatchSource.where(enabled: true), logger: Rails.logger)
  @logger = logger
  @sources = sources
  @http = Faraday.new do |conn|
    conn.headers["User-Agent"] = USER_AGENT
    conn.options.open_timeout = 10
    conn.options.timeout = 15
    conn.adapter Faraday.default_adapter
  end
end
# Checks every source and collects those that produced new hits.
#
# An error raised while checking one source is logged as a warning and does
# not stop the run. last_checked_at is written for each source whether or not
# its check succeeded.
#
# @return [Array<Result>] one Result per source that yielded at least one new hit
def run!
  collected = []

  @sources.find_each do |source|
    fresh_hits = check_one!(source)
    collected << Result.new(source: source, new_hits: fresh_hits) if fresh_hits.any?
  rescue => error
    @logger.warn("[TrainingWatch] source=#{source.id} #{source.name} failed: #{error.class}: #{error.message}")
  ensure
    source.update!(last_checked_at: Time.current)
  end

  collected
end
private
# Fetches one source and dispatches to the checker for its kind.
#
# @param source [TrainingWatchSource]
# @return [Array] newly created hits; empty when nothing was fetched or the
#   kind is neither "html" nor "rss"
def check_one!(source)
  response = fetch(source)
  return [] if response.nil?

  kind = source.kind
  if kind == "html"
    check_html(source, response)
  elsif kind == "rss"
    check_rss(source, response) # optional, minimal implementation further down
  else
    []
  end
end
# Performs a conditional GET for the source and stores the returned cache validators.
#
# If-None-Match / If-Modified-Since are sent when the source has stored values.
#
# @param source [TrainingWatchSource]
# @return [Faraday::Response, nil] the 2xx response, or nil on 304 Not Modified
# @raise [RuntimeError] for any other non-2xx status (including redirects)
def fetch(source)
  headers = {}
  headers["If-None-Match"] = source.last_etag if source.last_etag.present?
  headers["If-Modified-Since"] = source.last_modified if source.last_modified.present?

  resp = @http.get(source.url, nil, headers)

  if resp.status == 304
    @logger.info("[TrainingWatch] source=#{source.id} not modified")
    return nil
  end

  # The connection is built without a redirect middleware, so a 3xx response
  # does not contain the page. Treat it as a failure instead of parsing the
  # redirect stub and overwriting the stored validators.
  unless resp.success?
    location = resp.headers["location"]
    raise "HTTP #{resp.status} (redirect to #{location} not followed)" if location.present?

    raise "HTTP #{resp.status}"
  end

  source.update!(
    last_etag: resp.headers["etag"],
    last_modified: resp.headers["last-modified"]
  )
  resp
end
# Scans an HTML page for the source's pattern and stores the resulting hits.
#
# When the page text matches, links whose label or URL contain "2027" are
# stored as hits (at most 10 distinct ones). If no such link exists, a single
# hit for the page itself is stored.
#
# @param source [TrainingWatchSource]
# @param response [#body] fetched HTTP response
# @return [Array] hits that were newly created (already-known ones are dropped)
def check_html(source, response)
  html = response.body.to_s
  doc = Nokogiri::HTML(html)
  page_title = doc.at_css("title")&.text&.strip
  text = doc.text.gsub(/\s+/, " ").strip
  regex = compile_regex(source)
  return [] unless text.match?(regex)

  # The snippet depends only on the page text, so it is built once for all hits.
  snippet = build_snippet(text, regex)

  # Collect the page's links with a whitespace-normalized label and an absolute URL.
  candidates = doc.css("a").filter_map do |a|
    href = a["href"].to_s.strip
    next if href.empty?

    label = a.text.to_s.gsub(/\s+/, " ").strip
    { label:, url: absolutize_url(source.url, href) }
  end

  # Prefer links that explicitly mention 2027. Identical label/URL pairs would
  # produce the same fingerprint, so duplicates are removed before storing.
  link_hits = candidates.uniq.select { |c| "#{c[:label]} #{c[:url]}".match?(/\b2027\b/i) }.first(10)

  if link_hits.empty?
    # No specific link: record the page itself as the hit.
    page_hit = create_hit!(source:, title: page_title || source.name, hit_url: source.url, snippet:)
    return [page_hit].compact
  end

  link_hits.filter_map do |c|
    create_hit!(source:, title: c[:label].presence || page_title, hit_url: c[:url], snippet:)
  end
end
# Optional RSS support (for when feed URLs are used later).
#
# Minimal approach: the raw feed body is matched as plain text, and a single
# hit pointing at the source URL is stored when it matches.
#
# @param source [TrainingWatchSource]
# @param response [#body] fetched HTTP response
# @return [Array] the newly created hit in an array, or an empty array
def check_rss(source, response)
  feed_text = response.body.to_s
  pattern = compile_regex(source)
  return [] unless feed_text.match?(pattern)

  hit = create_hit!(
    source: source,
    title: source.name,
    hit_url: source.url,
    snippet: build_snippet(feed_text.gsub(/\s+/, " "), pattern)
  )
  [hit].compact
end
# Returns the matcher for a source.
#
# A source-specific match_regex is compiled case-insensitively with the
# multiline flag; a blank or invalid pattern falls back to DEFAULT_REGEX.
#
# @param source [TrainingWatchSource]
# @return [Regexp]
def compile_regex(source)
  custom = source.match_regex
  if custom.blank?
    DEFAULT_REGEX
  else
    Regexp.new(custom, Regexp::IGNORECASE | Regexp::MULTILINE)
  end
rescue RegexpError
  DEFAULT_REGEX
end
# Cuts an excerpt of the text around the first match of the regex.
#
# Without a match, the first 280 characters are returned unchanged. With a
# match, up to 380 characters starting 120 characters before the match (or at
# the start of the text) are returned with surrounding whitespace stripped.
#
# @param text [String]
# @param regex [Regexp]
# @return [String]
def build_snippet(text, regex)
  found = text.match(regex)
  return text[0, 280] unless found

  offset = found.begin(0) - 120
  offset = 0 if offset.negative?

  (text[offset, 380] || text).strip
end
# Resolves a link target against a base URL.
#
# Absolute hrefs are returned as parsed; relative ones are joined onto the
# base. If parsing or joining fails, the href is returned unchanged.
#
# @param base [String] URL of the page the link was found on
# @param href [String] raw link target
# @return [String]
def absolutize_url(base, href)
  parsed = URI.parse(href)
  resolved = parsed.absolute? ? parsed : URI.join(base, href)
  resolved.to_s
rescue
  href
end
# Stores a hit unless one with the same fingerprint already exists.
#
# The fingerprint is a SHA-256 over the source id, title, URL and the first
# 200 characters of the snippet.
#
# @param source [TrainingWatchSource]
# @param title [String, nil]
# @param hit_url [String, nil]
# @param snippet [String, nil]
# @param published_at [Time, nil]
# @return [TrainingWatchHit, nil] the new record, or nil when the hit is already known
# @raise [ActiveRecord::RecordInvalid] when the record is invalid for any
#   reason other than a duplicate fingerprint
def create_hit!(source:, title:, hit_url:, snippet:, published_at: nil)
  fp = Digest::SHA256.hexdigest([source.id, title.to_s, hit_url.to_s, snippet.to_s[0, 200]].join("|"))

  # The model validates fingerprint uniqueness, so create! would raise
  # RecordInvalid (not RecordNotUnique) for a known hit. Skip known hits up front.
  return nil if TrainingWatchHit.exists?(fingerprint: fp)

  TrainingWatchHit.create!(
    training_watch_source: source,
    title: title.to_s.strip.presence,
    hit_url: hit_url.to_s.strip.presence,
    snippet: snippet.to_s.strip.presence,
    published_at:,
    fingerprint: fp
  )
rescue ActiveRecord::RecordNotUnique
  # A concurrent writer inserted the same fingerprint; the unique index rejected ours.
  nil
rescue ActiveRecord::RecordInvalid => e
  # Same race, caught by the uniqueness validation instead of the index.
  raise unless e.record&.errors&.of_kind?(:fingerprint, :taken)

  nil
end
end
end

2
app/views/layouts/application.html.erb

@ -17,7 +17,7 @@
<!-- JS: Bootstrap Bundle (inkl. Popper) --> <!-- JS: Bootstrap Bundle (inkl. Popper) -->
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/js/bootstrap.bundle.min.js" defer></script> <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/js/bootstrap.bundle.min.js" defer></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/4.5.0/chart.min.js" integrity="sha512-n/G+dROKbKL3GVngGWmWfwK0yPctjZQM752diVYnXZtD/48agpUKLIn0xDQL9ydZ91x6BiOmTIFwWjjFi2kEFg==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
<script> <script>
document.addEventListener('DOMContentLoaded', function () { document.addEventListener('DOMContentLoaded', function () {
const modalElement = document.getElementById('deleteConfirmModal'); const modalElement = document.getElementById('deleteConfirmModal');

12
app/views/training_watch_mailer/notify.text.erb

@ -0,0 +1,12 @@
<%# app/views/training_watch_mailer/notify.text.erb %>
<%# Plain-text body: an HTML comment would be printed literally, so ERB comments are used. %>
Neue Treffer für Lehrgänge Start 2027:
<% @results.each do |r| %>
Quelle: <%= r.source.name %> (<%= r.source.url %>)
<% r.new_hits.each do |h| %>
- <%= h.title.presence || "Treffer" %>
<%= h.hit_url %>
<%= h.snippet %>
<% end %>
<% end %>

6
config/initializers/dump_scheduler.rb

@ -10,3 +10,9 @@ scheduler.cron '0 2 * * *' do
Rails.logger.info "[Scheduler] Starte täglichen Dump-Versand" Rails.logger.info "[Scheduler] Starte täglichen Dump-Versand"
DailyDumpJob.perform_later DailyDumpJob.perform_later
end end
# Enqueues the training watch job once a day (cron "0 0 * * *").
scheduler.cron '0 0 * * *' do
  Rails.logger.info "[Scheduler] Starte TrainingWatch"
  # Enqueue the job class; TrainingWatch is a plain module without perform_later.
  TrainingWatchJob.perform_later
end

18
db/migrate/20251207054128_create_training_watch_sources.rb

@ -0,0 +1,18 @@
# Creates the table of watched sources.
class CreateTrainingWatchSources < ActiveRecord::Migration[7.1]
  def change
    create_table :training_watch_sources do |t|
      t.string :name, null: false
      t.string :url, null: false
      t.string :kind, null: false, default: "html" # html | rss (optional)
      t.string :match_regex # optional: overrides the default matcher
      t.string :last_etag
      t.string :last_modified
      t.datetime :last_checked_at
      t.boolean :enabled, null: false, default: true
      t.timestamps

      t.index :enabled
      t.index :url, unique: true
    end
  end
end

16
db/migrate/20251207054146_create_training_watch_hits.rb

@ -0,0 +1,16 @@
# Creates the table of hits recorded per watched source.
class CreateTrainingWatchHits < ActiveRecord::Migration[7.1]
  def change
    create_table :training_watch_hits do |t|
      t.references :training_watch_source, null: false, foreign_key: true
      t.string :title
      t.string :hit_url
      t.datetime :published_at
      t.text :snippet
      t.string :fingerprint, null: false
      t.timestamps

      # The unique fingerprint index prevents duplicate hits at the database level.
      t.index :fingerprint, unique: true
      t.index :created_at
    end
  end
end

32
db/schema.rb

@ -10,7 +10,7 @@
# #
# It's strongly recommended that you check this file into your version control system. # It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema[7.1].define(version: 2025_12_01_050251) do
ActiveRecord::Schema[7.1].define(version: 2025_12_07_054146) do
# These are extensions that must be enabled in order to support this database # These are extensions that must be enabled in order to support this database
enable_extension "pg_stat_statements" enable_extension "pg_stat_statements"
enable_extension "plpgsql" enable_extension "plpgsql"
@ -51,6 +51,35 @@ ActiveRecord::Schema[7.1].define(version: 2025_12_01_050251) do
t.index ["year"], name: "index_mileage_rates_on_year", unique: true t.index ["year"], name: "index_mileage_rates_on_year", unique: true
end end
create_table "training_watch_hits", force: :cascade do |t|
t.bigint "training_watch_source_id", null: false
t.string "title"
t.string "hit_url"
t.datetime "published_at"
t.text "snippet"
t.string "fingerprint", null: false
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["created_at"], name: "index_training_watch_hits_on_created_at"
t.index ["fingerprint"], name: "index_training_watch_hits_on_fingerprint", unique: true
t.index ["training_watch_source_id"], name: "index_training_watch_hits_on_training_watch_source_id"
end
create_table "training_watch_sources", force: :cascade do |t|
t.string "name", null: false
t.string "url", null: false
t.string "kind", default: "html", null: false
t.string "match_regex"
t.string "last_etag"
t.string "last_modified"
t.datetime "last_checked_at"
t.boolean "enabled", default: true, null: false
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["enabled"], name: "index_training_watch_sources_on_enabled"
t.index ["url"], name: "index_training_watch_sources_on_url", unique: true
end
create_table "users", force: :cascade do |t| create_table "users", force: :cascade do |t|
t.string "email", default: "", null: false t.string "email", default: "", null: false
t.string "encrypted_password", default: "", null: false t.string "encrypted_password", default: "", null: false
@ -75,4 +104,5 @@ ActiveRecord::Schema[7.1].define(version: 2025_12_01_050251) do
end end
add_foreign_key "entries", "users" add_foreign_key "entries", "users"
add_foreign_key "training_watch_hits", "training_watch_sources"
end end

16
db/seeds.rb

@ -7,3 +7,19 @@
# ["Action", "Comedy", "Drama", "Horror"].each do |genre_name| # ["Action", "Comedy", "Drama", "Horror"].each do |genre_name|
# MovieGenre.find_or_create_by!(name: genre_name) # MovieGenre.find_or_create_by!(name: genre_name)
# end # end
# db/seeds.rb (excerpt)
# Watched sources, keyed by URL. Existing rows are left untouched; the name is
# only assigned when a row is created.
{
  "https://oeas.at/ausbildung/curricula-wien" => "ÖAS Curricula Wien",
  "https://www.la-sf.at/" => "la:sf",
  "https://www.oeagg.at/" => "ÖAGG",
  "https://www.sfu.ac.at/" => "SFU"
}.each do |source_url, source_name|
  TrainingWatchSource.find_or_create_by!(url: source_url) do |s|
    s.name = source_name
  end
end
Loading…
Cancel
Save