From 04a424b4bbf10da3e495ca0a57a668aba56a312d Mon Sep 17 00:00:00 2001 From: Mahad Kalam Date: Tue, 24 Mar 2026 10:18:36 +0000 Subject: [PATCH 01/24] ClickHouse? --- .env.example | 6 + DEVELOPMENT.md | 3 + Gemfile | 2 + .../api/admin/v1/admin_controller.rb | 102 ++++------ .../api/hackatime/v1/hackatime_controller.rb | 2 +- .../api/v1/authenticated/hours_controller.rb | 2 +- .../concerns/api/admin/v1/user_utilities.rb | 2 +- app/jobs/cache/active_projects_job.rb | 34 +++- app/jobs/cache/active_users_graph_data_job.rb | 25 +-- app/jobs/cache/currently_hacking_count_job.rb | 7 +- app/jobs/cache/currently_hacking_job.rb | 29 +-- app/jobs/cache/home_stats_job.rb | 12 +- app/jobs/leaderboard_update_job.rb | 10 +- .../recalc_heartbeat_field_hash_job.rb | 2 +- app/jobs/sync_all_user_repo_events_job.rb | 9 +- app/jobs/weekly_summary_email_job.rb | 15 +- app/mailers/weekly_summary_mailer.rb | 10 +- app/models/concerns/heartbeatable.rb | 180 ++++++++---------- app/models/heartbeat.rb | 34 +--- app/models/user.rb | 1 + app/services/anonymize_user_service.rb | 4 +- app/services/heartbeat_import_service.rb | 4 +- app/services/leaderboard_builder.rb | 7 +- app/services/profile_stats_service.rb | 21 +- app/services/timeline_service.rb | 4 +- config/database.yml | 24 +++ docker-compose.yml | 26 ++- lib/tasks/seed_dummy_users.rake | 2 +- .../previews/weekly_summary_mailer_preview.rb | 4 +- test/models/heartbeat_test.rb | 12 +- test/services/anonymize_user_service_test.rb | 6 +- 31 files changed, 314 insertions(+), 287 deletions(-) diff --git a/.env.example b/.env.example index 0192c6534..575c01fe1 100644 --- a/.env.example +++ b/.env.example @@ -60,3 +60,9 @@ S3_ACCESS_KEY_ID=your_s3_access_key_id_here S3_SECRET_ACCESS_KEY=your_s3_secret_access_key_here S3_BUCKET=your_s3_bucket_name_here S3_ENDPOINT=https://.r2.cloudflarestorage.com + +# ClickHouse database +CLICKHOUSE_URL=http://clickhouse:8123 +CLICKHOUSE_DATABASE=hackatime_development +CLICKHOUSE_USERNAME=default +CLICKHOUSE_PASSWORD= diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 18cf369ec..90f56c2e9 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -40,6 +40,8 @@ HCA_CLIENT_ID= HCA_CLIENT_SECRET= ``` +ClickHouse is automatically started by Docker Compose alongside Postgres — no extra setup needed. + Start the containers: ```sh @@ -51,6 +53,7 @@ We'll now setup the database. In your container shell, run the following: ```bash app# bin/rails db:create db:schema:load db:seed +app# bin/rails db:migrate:clickhouse ``` Run the Vite build with SSR (server-side-rendering): diff --git a/Gemfile b/Gemfile index 4a01d0745..4c4fc89c4 100644 --- a/Gemfile +++ b/Gemfile @@ -9,6 +9,8 @@ gem "propshaft" gem "sqlite3", ">= 2.1" # Use PostgreSQL as the database for Wakatime gem "pg" +# Use ClickHouse for analytics +gem "clickhouse-activerecord" # Use the Puma web server [https://github.com/puma/puma] gem "puma", ">= 5.0" # Use JavaScript with ESM import maps [https://github.com/rails/importmap-rails] diff --git a/app/controllers/api/admin/v1/admin_controller.rb b/app/controllers/api/admin/v1/admin_controller.rb index a69716147..73e855e9d 100644 --- a/app/controllers/api/admin/v1/admin_controller.rb +++ b/app/controllers/api/admin/v1/admin_controller.rb @@ -51,71 +51,77 @@ def visualization_quantized quantized_query = <<-SQL WITH base_heartbeats AS ( SELECT - "time", + time, lineno, cursorpos, - date_trunc('day', to_timestamp("time")) as day_start + toDate(toDateTime(toUInt32(time))) as day_start FROM heartbeats WHERE user_id = ? - AND "time" >= ? AND "time" <= ? + AND time >= ? AND time <= ? AND (lineno IS NOT NULL OR cursorpos IS NOT NULL) LIMIT 1000000 ), daily_stats AS ( SELECT *, - GREATEST(1, MAX(lineno) OVER (PARTITION BY day_start)) as max_lineno, - GREATEST(1, MAX(cursorpos) OVER (PARTITION BY day_start)) as max_cursorpos + greatest(1, max(lineno) OVER (PARTITION BY day_start)) as max_lineno, + greatest(1, max(cursorpos) OVER (PARTITION BY day_start)) as max_cursorpos FROM base_heartbeats ), quantized_heartbeats AS ( SELECT *, - ROUND(2 + (("time" - extract(epoch from day_start)) / 86400) * (396)) as qx, - ROUND(2 + (1 - CAST(lineno AS decimal) / max_lineno) * (96)) as qy_lineno, - ROUND(2 + (1 - CAST(cursorpos AS decimal) / max_cursorpos) * (96)) as qy_cursorpos + round(2 + ((time - toUInt32(toDateTime(day_start))) / 86400) * (396)) as qx, + round(2 + (1 - CAST(lineno AS Float64) / max_lineno) * (96)) as qy_lineno, + round(2 + (1 - CAST(cursorpos AS Float64) / max_cursorpos) * (96)) as qy_cursorpos FROM daily_stats ) - SELECT "time", lineno, cursorpos + SELECT time, lineno, cursorpos FROM ( - SELECT DISTINCT ON (day_start, qx, qy_lineno) "time", lineno, cursorpos + SELECT + any(time) AS time, + any(lineno) AS lineno, + any(cursorpos) AS cursorpos FROM quantized_heartbeats WHERE lineno IS NOT NULL - ORDER BY day_start, qx, qy_lineno, "time" ASC + GROUP BY day_start, qx, qy_lineno ) AS lineno_pixels - UNION - SELECT "time", lineno, cursorpos + UNION ALL + SELECT time, lineno, cursorpos FROM ( - SELECT DISTINCT ON (day_start, qx, qy_cursorpos) "time", lineno, cursorpos + SELECT + any(time) AS time, + any(lineno) AS lineno, + any(cursorpos) AS cursorpos FROM quantized_heartbeats WHERE cursorpos IS NOT NULL - ORDER BY day_start, qx, qy_cursorpos, "time" ASC + GROUP BY day_start, qx, qy_cursorpos ) AS cursorpos_pixels - ORDER BY "time" ASC + ORDER BY time ASC SQL daily_totals_query = <<-SQL WITH heartbeats_with_gaps AS ( SELECT - date_trunc('day', to_timestamp("time"))::date as day, - "time" - LAG("time", 1, "time") OVER (PARTITION BY date_trunc('day', to_timestamp("time")) ORDER BY "time") as gap + toDate(toDateTime(toUInt32(time))) as day, + time - lagInFrame(time, 1, time) OVER (PARTITION BY toDate(toDateTime(toUInt32(time))) ORDER BY time) as gap FROM heartbeats WHERE user_id = ? AND time >= ? AND time <= ? ) SELECT day, - SUM(LEAST(gap, 120)) as total_seconds + SUM(least(gap, 120)) as total_seconds FROM heartbeats_with_gaps WHERE gap IS NOT NULL GROUP BY day SQL - quantized_result = ActiveRecord::Base.connection.execute( - ActiveRecord::Base.sanitize_sql([ quantized_query, user.id, start_epoch, end_epoch ]) + quantized_result = Heartbeat.connection.select_all( + Heartbeat.sanitize_sql([ quantized_query, user.id, start_epoch, end_epoch ]) ) - daily_totals_result = ActiveRecord::Base.connection.execute( - ActiveRecord::Base.sanitize_sql([ daily_totals_query, user.id, start_epoch, end_epoch ]) + daily_totals_result = Heartbeat.connection.select_all( + Heartbeat.sanitize_sql([ daily_totals_query, user.id, start_epoch, end_epoch ]) ) daily_totals = daily_totals_result.each_with_object({}) do |row, hash| @@ -197,8 +203,8 @@ def alt_candidates LIMIT 5000 SQL - result = ActiveRecord::Base.connection.exec_query( - ActiveRecord::Base.sanitize_sql([ query, cutoff, cutoff ]) + result = Heartbeat.connection.select_all( + Heartbeat.sanitize_sql([ query, cutoff, cutoff ]) ) render json: { candidates: result.to_a } @@ -210,44 +216,20 @@ def shared_machines query = <<-SQL SELECT - sms.machine, - sms.machine_frequency, - ARRAY_AGG(DISTINCT u.id) AS user_ids - FROM - ( - SELECT - machine, - COUNT(user_id) AS machine_frequency, - ARRAY_AGG(user_id) AS user_ids - FROM - ( - SELECT DISTINCT - machine, - user_id - FROM - heartbeats - WHERE - machine IS NOT NULL - AND time > ? - ) AS UserMachines - GROUP BY - machine - HAVING - COUNT(user_id) > 1 - ) AS sms, - LATERAL UNNEST(sms.user_ids) AS user_id_from_array - JOIN - users AS u ON u.id = user_id_from_array - GROUP BY - sms.machine, - sms.machine_frequency - ORDER BY - sms.machine_frequency DESC + machine, + uniq(user_id) AS machine_frequency, + groupArray(DISTINCT user_id) AS user_ids + FROM heartbeats + WHERE machine != '' AND machine IS NOT NULL + AND time > ? + GROUP BY machine + HAVING uniq(user_id) > 1 + ORDER BY machine_frequency DESC LIMIT 5000 SQL - result = ActiveRecord::Base.connection.exec_query( - ActiveRecord::Base.sanitize_sql([ query, cutoff ]) + result = Heartbeat.connection.select_all( + Heartbeat.sanitize_sql([ query, cutoff ]) ) render json: { machines: result.to_a } diff --git a/app/controllers/api/hackatime/v1/hackatime_controller.rb b/app/controllers/api/hackatime/v1/hackatime_controller.rb index 3b1fed660..8a60d3b37 100644 --- a/app/controllers/api/hackatime/v1/hackatime_controller.rb +++ b/app/controllers/api/hackatime/v1/hackatime_controller.rb @@ -282,7 +282,7 @@ def handle_heartbeat(heartbeat_array) }).slice(*Heartbeat.column_names.map(&:to_sym)) # ^^ They say safety laws are written in blood. Well, so is this line! # Basically this filters out columns that aren't in our DB (the biggest one being raw_data) - new_heartbeat = Heartbeat.find_or_create_by(attrs) + new_heartbeat = Heartbeat.create(attrs) queue_project_mapping(heartbeat[:project]) results << [ new_heartbeat.attributes, 201 ] diff --git a/app/controllers/api/v1/authenticated/hours_controller.rb b/app/controllers/api/v1/authenticated/hours_controller.rb index d9348815f..2f51d9061 100644 --- a/app/controllers/api/v1/authenticated/hours_controller.rb +++ b/app/controllers/api/v1/authenticated/hours_controller.rb @@ -7,7 +7,7 @@ def index end_date = params[:end_date]&.to_date || Date.current total_seconds = current_user.heartbeats - .where(created_at: start_date.beginning_of_day..end_date.end_of_day) + .where(time: start_date.beginning_of_day.to_f..end_date.end_of_day.to_f) .duration_seconds render json: { diff --git a/app/controllers/concerns/api/admin/v1/user_utilities.rb b/app/controllers/concerns/api/admin/v1/user_utilities.rb index 807b42ca0..26803d792 100644 --- a/app/controllers/concerns/api/admin/v1/user_utilities.rb +++ b/app/controllers/concerns/api/admin/v1/user_utilities.rb @@ -179,7 +179,7 @@ def user_info total_coding_time: valid.duration_seconds || 0, languages_used: valid.distinct.pluck(:language).compact.count, projects_worked_on: valid.distinct.pluck(:project).compact.count, - days_active: valid.distinct.count("DATE(to_timestamp(CASE WHEN time > 1000000000000 THEN time / 1000 ELSE time END))") + days_active: Heartbeat.connection.select_value("SELECT uniq(toDate(toDateTime(toUInt32(time)))) FROM (#{valid.to_sql}) AS hb").to_i } } } diff --git a/app/jobs/cache/active_projects_job.rb b/app/jobs/cache/active_projects_job.rb index 59d175068..f63e280b9 100644 --- a/app/jobs/cache/active_projects_job.rb +++ b/app/jobs/cache/active_projects_job.rb @@ -8,14 +8,30 @@ def cache_expiration end def calculate - # Get recent heartbeats with matching project_repo_mappings in a single SQL query - ProjectRepoMapping.active - .joins("INNER JOIN heartbeats ON heartbeats.project = project_repo_mappings.project_name AND heartbeats.user_id = project_repo_mappings.user_id") - .joins("INNER JOIN users ON users.id = heartbeats.user_id") - .where("heartbeats.source_type = ?", Heartbeat.source_types[:direct_entry]) - .where("heartbeats.time > ?", 5.minutes.ago.to_f) - .select("DISTINCT ON (heartbeats.user_id) project_repo_mappings.*, heartbeats.user_id") - .order("heartbeats.user_id, heartbeats.time DESC") - .index_by(&:user_id) + # Query recent heartbeats from ClickHouse + recent_hbs = Heartbeat.where(source_type: Heartbeat.source_types[:direct_entry]) + .where("time > ?", 5.minutes.ago.to_f) + .order(time: :desc) + .to_a + + # Deduplicate by user_id (most recent heartbeat per user) + latest_by_user = recent_hbs.group_by(&:user_id).transform_values(&:first) + + return {} if latest_by_user.empty? + + # Find matching project_repo_mappings from Postgres + user_ids = latest_by_user.keys + + mappings = ProjectRepoMapping.active + .where(user_id: user_ids) + .to_a + + result = {} + latest_by_user.each do |user_id, hb| + mapping = mappings.find { |m| m.user_id == user_id && m.project_name == hb.project } + result[user_id] = mapping if mapping + end + + result end end diff --git a/app/jobs/cache/active_users_graph_data_job.rb b/app/jobs/cache/active_users_graph_data_job.rb index 9cbc380d7..f9a68417b 100644 --- a/app/jobs/cache/active_users_graph_data_job.rb +++ b/app/jobs/cache/active_users_graph_data_job.rb @@ -5,21 +5,24 @@ class Cache::ActiveUsersGraphDataJob < Cache::ActivityJob def calculate # over the last 24 hours, count the number of people who were active each hour - hours = Heartbeat.coding_only - .with_valid_timestamps - .where("time > ?", 24.hours.ago.to_f) - .where("time < ?", Time.current.to_f) - .select("(EXTRACT(EPOCH FROM to_timestamp(time))::bigint / 3600 * 3600) as hour, COUNT(DISTINCT user_id) as count") - .group("hour") - .order("hour DESC") + connection = Heartbeat.connection + hours = connection.select_all(<<~SQL + SELECT + toInt64(toUInt32(time) / 3600) * 3600 AS hour, + uniq(user_id) AS count + FROM (#{Heartbeat.coding_only.with_valid_timestamps.where("time > ?", 24.hours.ago.to_f).where("time < ?", Time.current.to_f).to_sql}) AS hb + GROUP BY hour + ORDER BY hour DESC + SQL + ) - top_hour_count = hours.max_by(&:count)&.count || 1 + top_hour_count = hours.map { |h| h["count"].to_i }.max || 1 hours = hours.map do |h| { - hour: Time.at(h.hour), - users: h.count, - height: (h.count.to_f / top_hour_count * 100).round + hour: Time.at(h["hour"].to_i), + users: h["count"].to_i, + height: (h["count"].to_f / top_hour_count * 100).round } end end diff --git a/app/jobs/cache/currently_hacking_count_job.rb b/app/jobs/cache/currently_hacking_count_job.rb index 6930d62e8..ff0120c60 100644 --- a/app/jobs/cache/currently_hacking_count_job.rb +++ b/app/jobs/cache/currently_hacking_count_job.rb @@ -8,12 +8,11 @@ def cache_expiration end def calculate - count = Heartbeat.joins(:user) - .where(source_type: :direct_entry) + count = Heartbeat.where(source_type: :direct_entry) .coding_only .where("time > ?", 5.minutes.ago.to_f) - .select("DISTINCT user_id") - .count + .distinct + .count(:user_id) { count: count } end diff --git a/app/jobs/cache/currently_hacking_job.rb b/app/jobs/cache/currently_hacking_job.rb index 236e2de3c..aa90fceab 100644 --- a/app/jobs/cache/currently_hacking_job.rb +++ b/app/jobs/cache/currently_hacking_job.rb @@ -8,17 +8,24 @@ def cache_expiration end def calculate - # Get most recent heartbeats and users in a single query - recent_heartbeats = Heartbeat.joins(:user) - .where(source_type: :direct_entry) - .coding_only - .where("time > ?", 5.minutes.ago.to_f) - .select("DISTINCT ON (user_id) user_id, project, time, users.*") - .order("user_id, time DESC") - .includes(user: [ :project_repo_mappings, :email_addresses ]) - .index_by(&:user_id) - - users = recent_heartbeats.values.map(&:user) + # Query ClickHouse for recent heartbeats (no cross-DB join) + raw_heartbeats = Heartbeat.where(source_type: :direct_entry) + .coding_only + .where("time > ?", 5.minutes.ago.to_f) + .order(time: :desc) + .to_a + + # Deduplicate by user_id (keep most recent) + recent_heartbeats = raw_heartbeats.group_by(&:user_id) + .transform_values(&:first) + + # Load users from Postgres + user_ids = recent_heartbeats.keys + users_by_id = User.where(id: user_ids) + .includes(:project_repo_mappings, :email_addresses) + .index_by(&:id) + + users = user_ids.filter_map { |uid| users_by_id[uid] } active_projects = {} users.each do |user| diff --git a/app/jobs/cache/home_stats_job.rb b/app/jobs/cache/home_stats_job.rb index 2626dae8c..6d2ef377e 100644 --- a/app/jobs/cache/home_stats_job.rb +++ b/app/jobs/cache/home_stats_job.rb @@ -4,10 +4,16 @@ class Cache::HomeStatsJob < Cache::ActivityJob private def calculate - seconds_by_user = Heartbeat.group(:user_id).duration_seconds + result = HeartbeatUserDailySummary.connection.select_one(<<~SQL) + SELECT + uniq(user_id) AS users_tracked, + toInt64(coalesce(sum(duration_s), 0)) AS seconds_tracked + FROM heartbeat_user_daily_summary FINAL + SQL + { - users_tracked: seconds_by_user.size, - seconds_tracked: seconds_by_user.values.sum + users_tracked: result["users_tracked"].to_i, + seconds_tracked: result["seconds_tracked"].to_i } end end diff --git a/app/jobs/leaderboard_update_job.rb b/app/jobs/leaderboard_update_job.rb index 39b94861a..4059f574a 100644 --- a/app/jobs/leaderboard_update_job.rb +++ b/app/jobs/leaderboard_update_job.rb @@ -31,13 +31,15 @@ def build_leaderboard(date, period, force_update = false) range = LeaderboardDateRange.calculate(date, period) ActiveRecord::Base.transaction do - # Build the base heartbeat query + # Get eligible user IDs from Postgres (can't cross-DB join) + eligible_user_ids = User.where.not(github_uid: nil) + .where.not(trust_level: User.trust_levels[:red]) + .pluck(:id) + heartbeat_query = Heartbeat.where(time: range) .with_valid_timestamps - .joins(:user) .coding_only - .where.not(users: { github_uid: nil }) - .where.not(users: { trust_level: User.trust_levels[:red] }) + .where(user_id: eligible_user_ids) data = heartbeat_query.group(:user_id).duration_seconds .filter { |_, seconds| seconds > 60 } diff --git a/app/jobs/one_time/recalc_heartbeat_field_hash_job.rb b/app/jobs/one_time/recalc_heartbeat_field_hash_job.rb index 7d4f334cb..4218e2ca7 100644 --- a/app/jobs/one_time/recalc_heartbeat_field_hash_job.rb +++ b/app/jobs/one_time/recalc_heartbeat_field_hash_job.rb @@ -8,7 +8,7 @@ def perform heartbeat.save! rescue ActiveRecord::RecordNotUnique # If we have a duplicate fields_hash, soft delete this record - heartbeat.soft_delete + heartbeat.delete end end end diff --git a/app/jobs/sync_all_user_repo_events_job.rb b/app/jobs/sync_all_user_repo_events_job.rb index 252611acf..131143fe2 100644 --- a/app/jobs/sync_all_user_repo_events_job.rb +++ b/app/jobs/sync_all_user_repo_events_job.rb @@ -10,11 +10,14 @@ def perform # Identify users: # 1. Authenticated with GitHub (have an access token and username) # 2. Have had heartbeats in the last 6 hours + # Get user IDs with recent heartbeats from ClickHouse (can't cross-DB join) + recent_user_ids = Heartbeat.where("time >= ?", 6.hours.ago.to_f) + .distinct + .pluck(:user_id) + users_to_sync = User.where.not(github_access_token: nil) .where.not(github_username: nil) - .joins(:heartbeats) # Assumes User has_many :heartbeats - .where("heartbeats.created_at >= ?", 6.hours.ago) - .distinct + .where(id: recent_user_ids) if users_to_sync.empty? Rails.logger.info "No users eligible for GitHub event sync at this time." diff --git a/app/jobs/weekly_summary_email_job.rb b/app/jobs/weekly_summary_email_job.rb index d4f7dbdec..acb365553 100644 --- a/app/jobs/weekly_summary_email_job.rb +++ b/app/jobs/weekly_summary_email_job.rb @@ -15,18 +15,13 @@ def perform(reference_time = Time.current) private def eligible_users(cutoff) - users = User.arel_table - heartbeats = Heartbeat.arel_table - - recent_activity_exists = Heartbeat.unscoped - .where(heartbeats[:user_id].eq(users[:id])) - .where(heartbeats[:deleted_at].eq(nil)) - .where(heartbeats[:time].gteq(cutoff.to_f)) - .arel - .exists + # Query ClickHouse for user_ids with recent activity (can't do cross-DB subquery) + active_user_ids = Heartbeat.where("time >= ?", cutoff.to_f) + .distinct + .pluck(:user_id) User.subscribed("weekly_summary").where( - users[:created_at].gteq(cutoff).or(recent_activity_exists) + User.arel_table[:created_at].gteq(cutoff).or(User.arel_table[:id].in(active_user_ids)) ).where.not(id: DeletionRequest.active.select(:user_id)) end end diff --git a/app/mailers/weekly_summary_mailer.rb b/app/mailers/weekly_summary_mailer.rb index 632652aff..9e22a3754 100644 --- a/app/mailers/weekly_summary_mailer.rb +++ b/app/mailers/weekly_summary_mailer.rb @@ -48,10 +48,12 @@ def breakdown(scope, column, limit: 5) def active_days_count(scope) timezone = @timezone_label - timezone_sql = ActiveRecord::Base.connection.quote(timezone) - scope.where.not(time: nil) - .distinct - .count(Arel.sql("DATE(to_timestamp(time) AT TIME ZONE #{timezone_sql})")) + # ClickHouse-compatible: count distinct days using toDate with timezone + result = Heartbeat.connection.select_value(<<~SQL) + SELECT uniq(toDate(toDateTime(toUInt32(time), '#{timezone}'))) + FROM (#{scope.where.not(time: nil).to_sql}) AS hb + SQL + result.to_i rescue StandardError scope.where.not(time: nil).pluck(:time).map { |time| Time.at(time).in_time_zone(timezone).to_date }.uniq.count end diff --git a/app/models/concerns/heartbeatable.rb b/app/models/concerns/heartbeatable.rb index b32f24af9..35039d195 100644 --- a/app/models/concerns/heartbeatable.rb +++ b/app/models/concerns/heartbeatable.rb @@ -2,11 +2,7 @@ module Heartbeatable extend ActiveSupport::Concern included do - # Filter heartbeats to only include those with category equal to "coding" scope :coding_only, -> { where(category: "coding") } - - # This is to prevent PG timestamp overflow errors if someones gives us a - # heartbeat with a time that is enormously far in the future. scope :with_valid_timestamps, -> { where("time >= 0 AND time <= ?", 253402300799) } end @@ -28,7 +24,7 @@ def to_span(timeout_duration: nil) sql = <<~SQL SELECT time, - LEAD(time) OVER (ORDER BY time) as next_time + leadInFrame(time) OVER (ORDER BY time ASC ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING) as next_time FROM (#{heartbeats.to_sql}) AS heartbeats SQL @@ -42,10 +38,10 @@ def to_span(timeout_duration: nil) current_time = row["time"] next_time = row["next_time"] - if next_time.nil? || (next_time - current_time) > timeout_duration + if next_time.nil? || next_time == 0 || (next_time - current_time) > timeout_duration base_duration = (current_time - current_span_start).round - if next_time + if next_time && next_time != 0 gap_duration = [ next_time - current_time, timeout_duration ].min total_duration = base_duration + gap_duration end_time = current_time + gap_duration @@ -62,7 +58,7 @@ def to_span(timeout_duration: nil) } end - current_span_start = next_time if next_time + current_span_start = next_time if next_time && next_time != 0 end end @@ -79,9 +75,6 @@ def duration_formatted(scope = all) end def duration_simple(scope = all) - # 3 hours 10 min => "3 hrs" - # 1 hour 10 min => "1 hr" - # 10 min => "10 min" seconds = duration_seconds(scope) hours = seconds / 3600 minutes = (seconds % 3600) / 60 @@ -109,70 +102,67 @@ def daily_streaks_for_users(user_ids, start_date: 31.days.ago) return user_ids.index_with { |id| streak_cache["user_streak_#{id}"] || 0 } end + # Fetch user timezones from Postgres + user_timezones = User.where(id: uncached_users).pluck(:id, :timezone).to_h + timeout = heartbeat_timeout_duration.to_i - raw_durations = joins(:user) - .where(user_id: uncached_users) - .coding_only - .with_valid_timestamps - .where(time: start_date..Time.current) - .select( - :user_id, - "users.timezone as user_timezone", - Arel.sql("DATE_TRUNC('day', to_timestamp(time) AT TIME ZONE users.timezone) as day_group"), - Arel.sql("LEAST(time - LAG(time) OVER (PARTITION BY user_id, DATE_TRUNC('day', to_timestamp(time) AT TIME ZONE users.timezone) ORDER BY time), #{timeout}) as diff") + + # Query heartbeats from ClickHouse (no cross-DB join) + raw_sql = <<~SQL + SELECT + user_id, + toDate(toDateTime(toUInt32(time))) AS day_group, + toInt64(coalesce(sum(diff), 0)) AS duration + FROM ( + SELECT + user_id, + time, + least( + time - lagInFrame(time) OVER (PARTITION BY user_id, toDate(toDateTime(toUInt32(time))) ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW), + #{timeout} + ) AS diff + FROM heartbeats + WHERE user_id IN (#{uncached_users.join(',')}) + AND category = 'coding' + AND time >= 0 AND time <= 253402300799 + AND time >= #{start_date.to_f} + AND time <= #{Time.current.to_f} ) + GROUP BY user_id, day_group + SQL - # Then aggregate the results - daily_durations = connection.select_all( - "SELECT user_id, user_timezone, day_group, COALESCE(SUM(diff), 0)::integer as duration - FROM (#{raw_durations.to_sql}) AS diffs - GROUP BY user_id, user_timezone, day_group" - ).group_by { |row| row["user_id"] } - .transform_values do |rows| - timezone = rows.first["user_timezone"] - - if timezone.blank? - Rails.logger.warn "nil tz, going to utc." - timezone = "UTC" - else - begin - TZInfo::Timezone.get(timezone) - rescue TZInfo::InvalidTimezoneIdentifier, ArgumentError - Rails.logger.warn "Invalid timezone for streak calculation: #{timezone}. Defaulting to UTC." - timezone = "UTC" - end - end - - current_date = Time.current.in_time_zone(timezone).to_date - { - current_date: current_date, - days: rows.map do |row| - [ row["day_group"].to_date, row["duration"].to_i ] - end.sort_by { |date, _| date }.reverse - } - end + rows = connection.select_all(raw_sql) + + daily_durations = rows.group_by { |row| row["user_id"].to_i } + .transform_values do |user_rows| + user_rows.map do |row| + [ row["day_group"].to_date, row["duration"].to_i ] + end.sort_by { |date, _| date }.reverse + end result = user_ids.index_with { |id| streak_cache["user_streak_#{id}"] || 0 } - # Then calculate streaks for each user - daily_durations.each do |user_id, data| - current_date = data[:current_date] - days = data[:days] + daily_durations.each do |user_id, days| + timezone = user_timezones[user_id] || "UTC" + + begin + TZInfo::Timezone.get(timezone) + rescue TZInfo::InvalidTimezoneIdentifier, ArgumentError + timezone = "UTC" + end + + current_date = Time.current.in_time_zone(timezone).to_date - # Calculate streak streak = 0 days.each do |date, duration| - # Skip if this day is in the future next if date > current_date - # If they didn't code enough today, just skip if date == current_date next unless duration >= 15 * 60 streak += 1 next end - # For previous days, check if it's the next day in the streak if date == current_date - streak.days && duration >= 15 * 60 streak += 1 else @@ -181,8 +171,6 @@ def daily_streaks_for_users(user_ids, start_date: 31.days.ago) end result[user_id] = streak - - # Cache the streak for 1 hour Rails.cache.write("user_streak_#{user_id}", streak, expires_in: 1.hour) end @@ -197,14 +185,24 @@ def daily_durations(user_timezone:, start_date: 365.days.ago, end_date: Time.cur timezone = "UTC" end - # Create the timezone-aware date truncation expression - day_trunc = Arel.sql("DATE_TRUNC('day', to_timestamp(time) AT TIME ZONE '#{timezone}')") + sql = <<~SQL + SELECT + toDate(toDateTime(toUInt32(time), '#{timezone}')) AS day_group, + toInt64(coalesce(sum(diff), 0)) AS duration + FROM ( + SELECT + time, + least( + time - lagInFrame(time) OVER (ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW), + #{heartbeat_timeout_duration.to_i} + ) AS diff + FROM (#{with_valid_timestamps.where(time: start_date.to_f..end_date.to_f).to_sql}) AS hb + ) + GROUP BY day_group + ORDER BY day_group + SQL - select(day_trunc.as("day_group")) - .where(time: start_date..end_date) - .group(day_trunc) - .duration_seconds - .map { |date, duration| [ date.to_date, duration ] } + connection.select_all(sql).map { |row| [ row["day_group"].to_date, row["duration"].to_i ] } end def duration_seconds(scope = all) @@ -217,35 +215,26 @@ def duration_seconds(scope = all) end group_column = scope.group_values.first + group_expr = group_column.to_s.include?("(") ? group_column : "`#{group_column}`" - # Don't quote if it's a SQL function (contains parentheses) - group_expr = group_column.to_s.include?("(") ? group_column : connection.quote_column_name(group_column) - - capped_diffs = scope - .select("#{group_expr} as grouped_time, CASE - WHEN LAG(time) OVER (PARTITION BY #{group_expr} ORDER BY time) IS NULL THEN 0 - ELSE LEAST(time - LAG(time) OVER (PARTITION BY #{group_expr} ORDER BY time), #{timeout}) - END as diff") + capped_diffs_sql = scope + .select("#{group_expr} as grouped_time, least(time - lagInFrame(time) OVER (PARTITION BY #{group_expr} ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW), #{timeout}) as diff") .where.not(time: nil) .unscope(:group) + .to_sql connection.select_all( - "SELECT grouped_time, COALESCE(SUM(diff), 0)::integer as duration - FROM (#{capped_diffs.to_sql}) AS diffs - GROUP BY grouped_time" + "SELECT grouped_time, toInt64(coalesce(sum(diff), 0)) as duration FROM (#{capped_diffs_sql}) GROUP BY grouped_time" ).each_with_object({}) do |row, hash| hash[row["grouped_time"]] = row["duration"].to_i end else - # when not grouped, return a single value - capped_diffs = scope - .select("CASE - WHEN LAG(time) OVER (ORDER BY time) IS NULL THEN 0 - ELSE LEAST(time - LAG(time) OVER (ORDER BY time), #{timeout}) - END as diff") + capped_diffs_sql = scope + .select("least(time - lagInFrame(time) OVER (ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW), #{timeout}) as diff") .where.not(time: nil) + .to_sql - connection.select_value("SELECT COALESCE(SUM(diff), 0)::integer FROM (#{capped_diffs.to_sql}) AS diffs").to_i + connection.select_value("SELECT toInt64(coalesce(sum(diff), 0)) FROM (#{capped_diffs_sql})").to_i end end @@ -256,7 +245,7 @@ def duration_seconds_boundary_aware(scope, start_time, end_time) base_scope = model_class.all.with_valid_timestamps excluded_categories = [ "browsing", "ai coding", "meeting", "communicating" ] - base_scope = base_scope.where.not("LOWER(category) IN (?)", excluded_categories) + base_scope = base_scope.where.not("lower(category) IN (?)", excluded_categories) if scope.where_values_hash["user_id"] base_scope = base_scope.where(user_id: scope.where_values_hash["user_id"]) @@ -270,18 +259,12 @@ def duration_seconds_boundary_aware(scope, start_time, end_time) base_scope = base_scope.where(project: scope.where_values_hash["project"]) end - if scope.where_values_hash["deleted_at"] - base_scope = base_scope.where(deleted_at: scope.where_values_hash["deleted_at"]) - end - - # get the heartbeat before the start_time boundary_heartbeat = base_scope .where("time < ?", start_time) .order(time: :desc) .limit(1) .first - # if it's not NULL, we'll use it if boundary_heartbeat combined_scope = base_scope .where("time >= ? OR time = ?", start_time, boundary_heartbeat.time) @@ -291,19 +274,14 @@ def duration_seconds_boundary_aware(scope, start_time, end_time) .where(time: start_time..end_time) end - # we calc w/ the boundary heartbeat, but we only sum within the orignal constraint timeout = heartbeat_timeout_duration.to_i - capped_diffs = combined_scope - .select("time, CASE - WHEN LAG(time) OVER (ORDER BY time) IS NULL THEN 0 - ELSE LEAST(time - LAG(time) OVER (ORDER BY time), #{timeout}) - END as diff") + capped_diffs_sql = combined_scope + .select("time, least(time - lagInFrame(time) OVER (ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW), #{timeout}) as diff") .where.not(time: nil) .order(time: :asc) + .to_sql - sql = "SELECT COALESCE(SUM(diff), 0)::integer - FROM (#{capped_diffs.to_sql}) AS diffs - WHERE time >= #{connection.quote(start_time)}" + sql = "SELECT toInt64(coalesce(sum(diff), 0)) FROM (#{capped_diffs_sql}) WHERE time >= #{connection.quote(start_time)}" connection.select_value(sql).to_i end end diff --git a/app/models/heartbeat.rb b/app/models/heartbeat.rb index 60b1372e4..53532794d 100644 --- a/app/models/heartbeat.rb +++ b/app/models/heartbeat.rb @@ -1,18 +1,13 @@ -class Heartbeat < ApplicationRecord - before_save :set_fields_hash! +class Heartbeat < ClickhouseRecord + self.table_name = "heartbeats" include Heartbeatable include TimeRangeFilterable time_range_filterable_field :time - # Default scope to exclude deleted records - default_scope { where(deleted_at: nil) } - scope :today, -> { where(time: Time.current.beginning_of_day.to_i..Time.current.end_of_day.to_i) } scope :recent, -> { where("time > ?", 24.hours.ago.to_i) } - scope :with_deleted, -> { unscope(where: :deleted_at) } - scope :only_deleted, -> { with_deleted.where.not(deleted_at: nil) } enum :source_type, { direct_entry: 0, @@ -82,15 +77,15 @@ class Heartbeat < ApplicationRecord neighborhood: 58 }, prefix: :claimed_by - # This is to prevent Rails from trying to use STI even though we have a "type" column + # Prevent Rails STI on the "type" column self.inheritance_column = nil + # Note: cross-database joins (Postgres users <-> ClickHouse heartbeats) will not work. + # Use separate queries instead of .joins(:heartbeats) or .includes(:heartbeats). belongs_to :user validates :time, presence: true - # after_create :mirror_to_wakatime - def self.recent_count Cache::HeartbeatCountsJob.perform_now[:recent_count] end @@ -109,24 +104,5 @@ def self.indexed_attributes %w[user_id branch category dependencies editor entity language machine operating_system project type user_agent line_additions line_deletions lineno lines cursorpos project_root_count time is_write] end - def soft_delete - update_column(:deleted_at, Time.current) - end - - def restore - update_column(:deleted_at, nil) - end - - private - - def set_fields_hash! - # only if the field exists in activerecord - if self.class.column_names.include?("fields_hash") - self.fields_hash = self.class.generate_fields_hash(self.attributes) - end - end - # def mirror_to_wakatime - # WakatimeMirror.mirror_heartbeat(self) - # end end diff --git a/app/models/user.rb b/app/models/user.rb index b165c8dfd..d1ee03dfd 100644 --- a/app/models/user.rb +++ b/app/models/user.rb @@ -108,6 +108,7 @@ def set_trust(level, changed_by_user: nil, reason: nil, notes: nil) end # ex: .set_trust(:green) or set_trust(1) setting it to red + # Heartbeats live in ClickHouse — simple scoped queries work but SQL JOINs will not. has_many :heartbeats has_many :goals, dependent: :destroy has_many :email_addresses, dependent: :destroy diff --git a/app/services/anonymize_user_service.rb b/app/services/anonymize_user_service.rb index 3d064821a..a1fac3f97 100644 --- a/app/services/anonymize_user_service.rb +++ b/app/services/anonymize_user_service.rb @@ -75,8 +75,8 @@ def destroy_associated_records user.project_repo_mappings.destroy_all user.goals.destroy_all - # tombstone - Heartbeat.unscoped.where(user_id: user.id, deleted_at: nil).update_all(deleted_at: Time.current) + # Delete heartbeats from ClickHouse + Heartbeat.where(user_id: user.id).delete_all user.access_grants.destroy_all user.access_tokens.destroy_all diff --git a/app/services/heartbeat_import_service.rb b/app/services/heartbeat_import_service.rb index 19a4db95d..5c56e15e6 100644 --- a/app/services/heartbeat_import_service.rb +++ b/app/services/heartbeat_import_service.rb @@ -108,8 +108,8 @@ def self.flush_batch(seen_hashes) end ActiveRecord::Base.logger.silence do - result = Heartbeat.upsert_all(records, unique_by: [ :fields_hash ]) - result.length + Heartbeat.insert_all(records) + records.length end end diff --git a/app/services/leaderboard_builder.rb b/app/services/leaderboard_builder.rb index 43001b682..a818a706b 100644 --- a/app/services/leaderboard_builder.rb +++ b/app/services/leaderboard_builder.rb @@ -17,11 +17,12 @@ def build_for_users(users, date, scope, period) range = LeaderboardDateRange.calculate(date, period) - beats = Heartbeat.where(user_id: ids, time: range) + # Filter to users with github_uid from Postgres (can't cross-DB join) + eligible_ids = users.where.not(github_uid: nil).pluck(:id) + + beats = Heartbeat.where(user_id: eligible_ids, time: range) .coding_only .with_valid_timestamps - .joins(:user) - .where.not(users: { github_uid: nil }) totals = beats.group(:user_id).duration_seconds totals = totals.filter { |_, seconds| seconds > 60 } diff --git a/app/services/profile_stats_service.rb b/app/services/profile_stats_service.rb index 157a25d8f..0b564bcc6 100644 --- a/app/services/profile_stats_service.rb +++ b/app/services/profile_stats_service.rb @@ -61,13 +61,12 @@ def compute_totals_and_breakdowns(timeout, today_start, today_end, week_start, w project, language, editor, - CASE - WHEN LAG(time) OVER (ORDER BY time) IS NULL THEN 0 - ELSE LEAST(time - LAG(time) OVER (ORDER BY time), #{timeout_quoted}) - END AS diff + least( + time - lagInFrame(time) OVER (ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW), + #{timeout_quoted} + ) AS diff FROM heartbeats WHERE user_id = #{user_id} - AND deleted_at IS NULL AND time IS NOT NULL AND time >= 0 AND time <= 253402300799 ) @@ -76,9 +75,9 @@ def compute_totals_and_breakdowns(timeout, today_start, today_end, week_start, w totals_sql = <<~SQL #{base_sql} SELECT - COALESCE(SUM(diff) FILTER (WHERE time >= #{today_start_quoted} AND time <= #{today_end_quoted}), 0)::integer AS today_seconds, - COALESCE(SUM(diff) FILTER (WHERE time >= #{week_start_quoted} AND time <= #{week_end_quoted}), 0)::integer AS week_seconds, - COALESCE(SUM(diff), 0)::integer AS all_seconds + toInt64(coalesce(sumIf(diff, time >= #{today_start_quoted} AND time <= #{today_end_quoted}), 0)) AS today_seconds, + toInt64(coalesce(sumIf(diff, time >= #{week_start_quoted} AND time <= #{week_end_quoted}), 0)) AS week_seconds, + toInt64(coalesce(sum(diff), 0)) AS all_seconds FROM heartbeat_diffs SQL @@ -104,7 +103,7 @@ def fetch_top_grouped(conn, base_sql, column, time_filter, limit) time_clause = time_filter ? "AND time >= #{time_filter}" : "" sql = <<~SQL #{base_sql} - SELECT #{column}, COALESCE(SUM(diff), 0)::integer AS duration + SELECT #{column}, toInt64(coalesce(sum(diff), 0)) AS duration FROM heartbeat_diffs WHERE #{column} IS NOT NULL AND #{column} != '' #{time_clause} @@ -121,7 +120,7 @@ def fetch_top_grouped(conn, base_sql, column, time_filter, limit) def fetch_top_grouped_with_repo(conn, base_sql, month_ago, limit) sql = <<~SQL #{base_sql} - SELECT project, COALESCE(SUM(diff), 0)::integer AS duration + SELECT project, toInt64(coalesce(sum(diff), 0)) AS duration FROM heartbeat_diffs WHERE time >= #{month_ago} AND project IS NOT NULL AND project != '' @@ -142,7 +141,7 @@ def fetch_top_grouped_with_repo(conn, base_sql, month_ago, limit) def fetch_top_editors_normalized(conn, base_sql, limit) sql = <<~SQL #{base_sql} - SELECT editor, COALESCE(SUM(diff), 0)::integer AS duration + SELECT editor, toInt64(coalesce(sum(diff), 0)) AS duration FROM heartbeat_diffs WHERE editor IS NOT NULL AND editor != '' GROUP BY editor diff --git a/app/services/timeline_service.rb b/app/services/timeline_service.rb index bb2e4da04..88eba929e 100644 --- a/app/services/timeline_service.rb +++ b/app/services/timeline_service.rb @@ -24,7 +24,7 @@ def timeline_data user_start_of_day = date.in_time_zone(user_tz).beginning_of_day.to_f user_end_of_day = date.in_time_zone(user_tz).end_of_day.to_f - total_coded_time_seconds = Heartbeat.where(user_id: user.id, deleted_at: nil) + total_coded_time_seconds = Heartbeat.where(user_id: user.id) .where("time >= ? AND time <= ?", user_start_of_day, user_end_of_day) .duration_seconds @@ -80,7 +80,7 @@ def heartbeats_by_user_id expanded_end = server_end_of_day + 24.hours.to_i Heartbeat - .where(user_id: valid_user_ids, deleted_at: nil) + .where(user_id: valid_user_ids) .where("time >= ? AND time <= ?", expanded_start, expanded_end) .select(:id, :user_id, :time, :entity, :project, :editor, :language) .order(:user_id, :time) diff --git a/config/database.yml b/config/database.yml index 95e82cdfc..088291770 100644 --- a/config/database.yml +++ b/config/database.yml @@ -20,6 +20,14 @@ development: encoding: unicode url: <%= ENV['SAILORS_LOG_DATABASE_URL'] %> replica: true + clickhouse: + adapter: clickhouse + host: <%= ENV.fetch('CLICKHOUSE_HOST', 'clickhouse') %> + port: <%= ENV.fetch('CLICKHOUSE_PORT', 8123) %> + database: <%= ENV.fetch('CLICKHOUSE_DATABASE', 'hackatime_development') %> + username: <%= ENV.fetch('CLICKHOUSE_USERNAME', 'default') %> + password: <%= ENV.fetch('CLICKHOUSE_PASSWORD', '') %> + migrations_paths: db/migrate_clickhouse # Warning: The database defined as "test" will be erased and # re-generated from your development database when you run "rake". @@ -35,6 +43,14 @@ test: database: app_test url: <%= ENV['TEST_DATABASE_URL'] %> replica: true + clickhouse: + adapter: clickhouse + host: <%= ENV.fetch('CLICKHOUSE_HOST', 'clickhouse') %> + port: <%= ENV.fetch('CLICKHOUSE_PORT', 8123) %> + database: <%= ENV.fetch('CLICKHOUSE_DATABASE', 'hackatime_test') %> + username: <%= ENV.fetch('CLICKHOUSE_USERNAME', 'default') %> + password: <%= ENV.fetch('CLICKHOUSE_PASSWORD', '') %> + migrations_paths: db/migrate_clickhouse # Store production database in the storage/ directory, which by default # is mounted as a persistent Docker volume in config/deploy.yml. @@ -60,3 +76,11 @@ production: adapter: sqlite3 database: storage/production_cable.sqlite3 migrations_paths: db/cable_migrate + clickhouse: + adapter: clickhouse + host: <%= ENV.fetch('CLICKHOUSE_HOST', 'clickhouse') %> + port: <%= ENV.fetch('CLICKHOUSE_PORT', 8123) %> + database: <%= ENV.fetch('CLICKHOUSE_DATABASE', 'hackatime_production') %> + username: <%= ENV.fetch('CLICKHOUSE_USERNAME', 'default') %> + password: <%= ENV.fetch('CLICKHOUSE_PASSWORD', '') %> + migrations_paths: db/migrate_clickhouse diff --git a/docker-compose.yml b/docker-compose.yml index d21461b40..058e08d77 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -17,8 +17,15 @@ services: - POSTGRES_USER=postgres - POSTGRES_PASSWORD=secureorpheus123 - TEST_DATABASE_URL=postgres://postgres:secureorpheus123@db:5432/app_test + - CLICKHOUSE_URL=http://clickhouse:8123 + - CLICKHOUSE_DATABASE=hackatime_development + - CLICKHOUSE_USERNAME=default + - CLICKHOUSE_PASSWORD= depends_on: - - db + db: + condition: service_started + clickhouse: + condition: service_healthy command: ["sleep", "infinity"] db: @@ -32,7 +39,24 @@ services: ports: - "5432:5432" + clickhouse: + image: clickhouse/clickhouse-server:24 + volumes: + - clickhouse_data:/var/lib/clickhouse + environment: + - CLICKHOUSE_DB=hackatime_development + - CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT=1 + ports: + - "8123:8123" + - "9000:9000" + healthcheck: + test: ["CMD", "clickhouse-client", "--query", "SELECT 1"] + interval: 5s + timeout: 3s + retries: 10 + volumes: harbor_postgres_data: + clickhouse_data: bundle_cache: node_modules: diff --git a/lib/tasks/seed_dummy_users.rake b/lib/tasks/seed_dummy_users.rake index 03f8c8266..4d5a73359 100644 --- a/lib/tasks/seed_dummy_users.rake +++ b/lib/tasks/seed_dummy_users.rake @@ -33,7 +33,7 @@ namespace :seed do ids = User.where("github_uid LIKE ?", "dummy_%").ids return puts "no dummies found (except for you)" if ids.empty? - Heartbeat.unscoped.where(user_id: ids).delete_all + Heartbeat.where(user_id: ids).delete_all LeaderboardEntry.where(user_id: ids).delete_all User.where(id: ids).delete_all puts "exploded #{ids.count} dummies" diff --git a/test/mailers/previews/weekly_summary_mailer_preview.rb b/test/mailers/previews/weekly_summary_mailer_preview.rb index 3c9b4626b..2d040e011 100644 --- a/test/mailers/previews/weekly_summary_mailer_preview.rb +++ b/test/mailers/previews/weekly_summary_mailer_preview.rb @@ -1,6 +1,8 @@ class WeeklySummaryMailerPreview < ActionMailer::Preview def weekly_summary - user = User.joins(:heartbeats).distinct.first || User.first + # Can't cross-DB join — find users with heartbeats separately + user_ids = Heartbeat.distinct.limit(1).pluck(:user_id) + user = (user_ids.any? ? User.find_by(id: user_ids.first) : nil) || User.first ends_at = Time.current.beginning_of_week starts_at = ends_at - 7.days diff --git a/test/models/heartbeat_test.rb b/test/models/heartbeat_test.rb index 3398ed337..3c949e881 100644 --- a/test/models/heartbeat_test.rb +++ b/test/models/heartbeat_test.rb @@ -1,7 +1,7 @@ require "test_helper" class HeartbeatTest < ActiveSupport::TestCase - test "soft delete hides record from default scope and restore brings it back" do + test "lightweight delete removes record from queries" do user = User.create! heartbeat = user.heartbeats.create!( entity: "src/main.rb", @@ -14,13 +14,9 @@ class HeartbeatTest < ActiveSupport::TestCase assert_includes Heartbeat.all, heartbeat - heartbeat.soft_delete + heartbeat.delete - assert_not_includes Heartbeat.all, heartbeat - assert_includes Heartbeat.with_deleted, heartbeat - - heartbeat.restore - - assert_includes Heartbeat.all, heartbeat + # After lightweight delete, the row is invisible to subsequent queries + assert_not_includes Heartbeat.where(user_id: user.id).to_a, heartbeat end end diff --git a/test/services/anonymize_user_service_test.rb b/test/services/anonymize_user_service_test.rb index 8c72c8a2a..84c8b45e1 100644 --- a/test/services/anonymize_user_service_test.rb +++ b/test/services/anonymize_user_service_test.rb @@ -52,9 +52,9 @@ class AnonymizeUserServiceTest < ActiveSupport::TestCase assert_equal 0, user.sign_in_tokens.count end - test "anonymization soft deletes active heartbeats" do + test "anonymization deletes heartbeats via lightweight delete" do user = User.create!(username: "hb_cleanup_#{SecureRandom.hex(4)}") - heartbeat = user.heartbeats.create!( + user.heartbeats.create!( entity: "src/app.rb", type: "file", category: "coding", @@ -65,7 +65,7 @@ class AnonymizeUserServiceTest < ActiveSupport::TestCase AnonymizeUserService.call(user) - assert heartbeat.reload.deleted_at.present? + assert_equal 0, Heartbeat.where(user_id: user.id).count end test "anonymization removes legacy encrypted import credentials" do From fb11941163709f2b444cc22ef477685d8e3f49b5 Mon Sep 17 00:00:00 2001 From: Mahad Kalam Date: Tue, 24 Mar 2026 10:21:05 +0000 Subject: [PATCH 02/24] clean --- app/models/clickhouse_record.rb | 4 ++ app/models/heartbeat.rb | 2 - app/models/heartbeat_user_daily_summary.rb | 3 ++ db/migrate_clickhouse/.keep | 0 .../20260324000001_create_heartbeats.rb | 44 +++++++++++++++++++ ...002_create_heartbeat_user_daily_summary.rb | 20 +++++++++ 6 files changed, 71 insertions(+), 2 deletions(-) create mode 100644 app/models/clickhouse_record.rb create mode 100644 app/models/heartbeat_user_daily_summary.rb create mode 100644 db/migrate_clickhouse/.keep create mode 100644 db/migrate_clickhouse/20260324000001_create_heartbeats.rb create mode 100644 db/migrate_clickhouse/20260324000002_create_heartbeat_user_daily_summary.rb diff --git a/app/models/clickhouse_record.rb b/app/models/clickhouse_record.rb new file mode 100644 index 000000000..f802a27e4 --- /dev/null +++ b/app/models/clickhouse_record.rb @@ -0,0 +1,4 @@ +class ClickhouseRecord < ActiveRecord::Base + self.abstract_class = true + connects_to database: { writing: :clickhouse, reading: :clickhouse } +end diff --git a/app/models/heartbeat.rb b/app/models/heartbeat.rb index 53532794d..0bcf5b034 100644 --- a/app/models/heartbeat.rb +++ b/app/models/heartbeat.rb @@ -103,6 +103,4 @@ def self.generate_fields_hash(attributes) def self.indexed_attributes %w[user_id branch category dependencies editor entity language machine operating_system project type user_agent line_additions line_deletions lineno lines cursorpos project_root_count time is_write] end - - end diff --git a/app/models/heartbeat_user_daily_summary.rb b/app/models/heartbeat_user_daily_summary.rb new file mode 100644 index 000000000..c60ebd42d --- /dev/null +++ b/app/models/heartbeat_user_daily_summary.rb @@ -0,0 +1,3 @@ +class HeartbeatUserDailySummary < ClickhouseRecord + self.table_name = "heartbeat_user_daily_summary" +end diff --git a/db/migrate_clickhouse/.keep b/db/migrate_clickhouse/.keep new file mode 100644 index 000000000..e69de29bb diff --git a/db/migrate_clickhouse/20260324000001_create_heartbeats.rb b/db/migrate_clickhouse/20260324000001_create_heartbeats.rb new file mode 100644 index 000000000..b2b0b59ef --- /dev/null +++ b/db/migrate_clickhouse/20260324000001_create_heartbeats.rb @@ -0,0 +1,44 @@ +class CreateHeartbeats < ActiveRecord::Migration[8.1] + def up + execute <<~SQL + CREATE TABLE IF NOT EXISTS heartbeats + ( + id Int64, + user_id Int64, + branch String DEFAULT '', + category String DEFAULT '', + dependencies Array(String), + editor String DEFAULT '', + entity String DEFAULT '', + language String DEFAULT '', + machine String DEFAULT '', + operating_system String DEFAULT '', + project String DEFAULT '', + type String DEFAULT '', + user_agent String DEFAULT '', + line_additions Int32 DEFAULT 0, + line_deletions Int32 DEFAULT 0, + lineno Int32 DEFAULT 0, + lines Int32 DEFAULT 0, + cursorpos Int32 DEFAULT 0, + project_root_count Int32 DEFAULT 0, + time Float64, + is_write UInt8 DEFAULT 0, + created_at DateTime64(6) DEFAULT now64(), + updated_at DateTime64(6) DEFAULT now64(), + source_type UInt8 DEFAULT 0, + ip_address String DEFAULT '', + ysws_program UInt8 DEFAULT 0, + fields_hash String DEFAULT '' + ) + ENGINE = ReplacingMergeTree() + PARTITION BY toYYYYMM(toDateTime(toUInt32(time))) + ORDER BY (user_id, toDate(toDateTime(toUInt32(time))), project, id) + SETTINGS index_granularity = 8192 + SQL + end + + def down + execute "DROP TABLE IF EXISTS heartbeats" + end +end diff --git a/db/migrate_clickhouse/20260324000002_create_heartbeat_user_daily_summary.rb b/db/migrate_clickhouse/20260324000002_create_heartbeat_user_daily_summary.rb new file mode 100644 index 000000000..ff257bed9 --- /dev/null +++ b/db/migrate_clickhouse/20260324000002_create_heartbeat_user_daily_summary.rb @@ -0,0 +1,20 @@ +class CreateHeartbeatUserDailySummary < ActiveRecord::Migration[8.1] + def up + execute <<~SQL + CREATE TABLE IF NOT EXISTS heartbeat_user_daily_summary + ( + user_id Int64, + day Date, + duration_s Float64, + heartbeats UInt32, + _version DateTime DEFAULT now() + ) + ENGINE = ReplacingMergeTree(_version) + ORDER BY (user_id, day) + SQL + end + + def down + execute "DROP TABLE IF EXISTS heartbeat_user_daily_summary" + end +end From ea5b7d4f3b00c0706231aef6ce8be900ed27fe1b Mon Sep 17 00:00:00 2001 From: Mahad Kalam Date: Tue, 24 Mar 2026 14:23:13 +0000 Subject: [PATCH 03/24] Address Greptile comments --- .env.example | 3 +- Gemfile.lock | 4 ++ .../api/admin/v1/admin_controller.rb | 8 +-- .../api/hackatime/v1/hackatime_controller.rb | 5 +- app/mailers/weekly_summary_mailer.rb | 3 +- app/models/concerns/heartbeatable.rb | 52 +++++++++--------- app/services/heartbeat_import_service.rb | 6 +- config/brakeman.ignore | 54 ++++++++++++++++-- db/clickhouse_schema.rb | 55 +++++++++++++++++++ 9 files changed, 151 insertions(+), 39 deletions(-) create mode 100644 db/clickhouse_schema.rb diff --git a/.env.example b/.env.example index 575c01fe1..6ac978177 100644 --- a/.env.example +++ b/.env.example @@ -62,7 +62,8 @@ S3_BUCKET=your_s3_bucket_name_here S3_ENDPOINT=https://.r2.cloudflarestorage.com # ClickHouse database -CLICKHOUSE_URL=http://clickhouse:8123 +CLICKHOUSE_HOST=clickhouse +CLICKHOUSE_PORT=8123 CLICKHOUSE_DATABASE=hackatime_development CLICKHOUSE_USERNAME=default CLICKHOUSE_PASSWORD= diff --git a/Gemfile.lock b/Gemfile.lock index 34249d07a..951045426 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -131,6 +131,9 @@ GEM childprocess (5.1.0) logger (~> 1.5) chunky_png (1.4.0) + clickhouse-activerecord (1.6.7) + activerecord (>= 7.1, < 9.0) + bundler (>= 1.13.4) cloudflare-rails (7.0.0) actionpack (>= 7.2.0, < 8.2.0) activesupport (>= 7.2.0, < 8.2.0) @@ -652,6 +655,7 @@ DEPENDENCIES brakeman bullet capybara + clickhouse-activerecord cloudflare-rails countries debug diff --git a/app/controllers/api/admin/v1/admin_controller.rb b/app/controllers/api/admin/v1/admin_controller.rb index 73e855e9d..e39ca48f3 100644 --- a/app/controllers/api/admin/v1/admin_controller.rb +++ b/app/controllers/api/admin/v1/admin_controller.rb @@ -58,7 +58,7 @@ def visualization_quantized FROM heartbeats WHERE user_id = ? AND time >= ? AND time <= ? - AND (lineno IS NOT NULL OR cursorpos IS NOT NULL) + AND (lineno > 0 OR cursorpos > 0) LIMIT 1000000 ), daily_stats AS ( @@ -83,7 +83,7 @@ def visualization_quantized any(lineno) AS lineno, any(cursorpos) AS cursorpos FROM quantized_heartbeats - WHERE lineno IS NOT NULL + WHERE lineno > 0 GROUP BY day_start, qx, qy_lineno ) AS lineno_pixels UNION ALL @@ -94,7 +94,7 @@ def visualization_quantized any(lineno) AS lineno, any(cursorpos) AS cursorpos FROM quantized_heartbeats - WHERE cursorpos IS NOT NULL + WHERE cursorpos > 0 GROUP BY day_start, qx, qy_cursorpos ) AS cursorpos_pixels ORDER BY time ASC @@ -218,7 +218,7 @@ def shared_machines SELECT machine, uniq(user_id) AS machine_frequency, - groupArray(DISTINCT user_id) AS user_ids + groupUniqArray(user_id) AS user_ids FROM heartbeats WHERE machine != '' AND machine IS NOT NULL AND time > ? diff --git a/app/controllers/api/hackatime/v1/hackatime_controller.rb b/app/controllers/api/hackatime/v1/hackatime_controller.rb index 8a60d3b37..97bb5520b 100644 --- a/app/controllers/api/hackatime/v1/hackatime_controller.rb +++ b/app/controllers/api/hackatime/v1/hackatime_controller.rb @@ -282,7 +282,10 @@ def handle_heartbeat(heartbeat_array) }).slice(*Heartbeat.column_names.map(&:to_sym)) # ^^ They say safety laws are written in blood. Well, so is this line! # Basically this filters out columns that aren't in our DB (the biggest one being raw_data) - new_heartbeat = Heartbeat.create(attrs) + fields_hash = Heartbeat.generate_fields_hash(attrs) + new_heartbeat = Heartbeat.find_or_create_by(fields_hash: fields_hash) do |hb| + hb.assign_attributes(attrs) + end queue_project_mapping(heartbeat[:project]) results << [ new_heartbeat.attributes, 201 ] diff --git a/app/mailers/weekly_summary_mailer.rb b/app/mailers/weekly_summary_mailer.rb index 9e22a3754..d4891ed98 100644 --- a/app/mailers/weekly_summary_mailer.rb +++ b/app/mailers/weekly_summary_mailer.rb @@ -48,9 +48,10 @@ def breakdown(scope, column, limit: 5) def active_days_count(scope) timezone = @timezone_label + tz_quoted = Heartbeat.connection.quote(timezone) # ClickHouse-compatible: count distinct days using toDate with timezone result = Heartbeat.connection.select_value(<<~SQL) - SELECT uniq(toDate(toDateTime(toUInt32(time), '#{timezone}'))) + SELECT uniq(toDate(toDateTime(toUInt32(time), #{tz_quoted}))) FROM (#{scope.where.not(time: nil).to_sql}) AS hb SQL result.to_i diff --git a/app/models/concerns/heartbeatable.rb b/app/models/concerns/heartbeatable.rb index 35039d195..60b0dce4d 100644 --- a/app/models/concerns/heartbeatable.rb +++ b/app/models/concerns/heartbeatable.rb @@ -107,37 +107,37 @@ def daily_streaks_for_users(user_ids, start_date: 31.days.ago) timeout = heartbeat_timeout_duration.to_i - # Query heartbeats from ClickHouse (no cross-DB join) + # Query raw heartbeat diffs from ClickHouse, then group by day per user's timezone in Ruby raw_sql = <<~SQL SELECT user_id, - toDate(toDateTime(toUInt32(time))) AS day_group, - toInt64(coalesce(sum(diff), 0)) AS duration - FROM ( - SELECT - user_id, - time, - least( - time - lagInFrame(time) OVER (PARTITION BY user_id, toDate(toDateTime(toUInt32(time))) ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW), - #{timeout} - ) AS diff - FROM heartbeats - WHERE user_id IN (#{uncached_users.join(',')}) - AND category = 'coding' - AND time >= 0 AND time <= 253402300799 - AND time >= #{start_date.to_f} - AND time <= #{Time.current.to_f} - ) - GROUP BY user_id, day_group + time, + least( + time - lagInFrame(time, 1, time) OVER (PARTITION BY user_id ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW), + #{timeout} + ) AS diff + FROM heartbeats + WHERE user_id IN (#{uncached_users.join(',')}) + AND category = 'coding' + AND time >= 0 AND time <= 253402300799 + AND time >= #{start_date.to_f} + AND time <= #{Time.current.to_f} SQL rows = connection.select_all(raw_sql) daily_durations = rows.group_by { |row| row["user_id"].to_i } .transform_values do |user_rows| - user_rows.map do |row| - [ row["day_group"].to_date, row["duration"].to_i ] - end.sort_by { |date, _| date }.reverse + user_id = user_rows.first["user_id"].to_i + tz = user_timezones[user_id] || "UTC" + begin + TZInfo::Timezone.get(tz) + rescue TZInfo::InvalidTimezoneIdentifier, ArgumentError + tz = "UTC" + end + user_rows.group_by { |row| Time.at(row["time"].to_f).in_time_zone(tz).to_date } + .map { |date, day_rows| [ date, day_rows.sum { |r| r["diff"].to_i } ] } + .sort_by { |date, _| date }.reverse end result = user_ids.index_with { |id| streak_cache["user_streak_#{id}"] || 0 } @@ -193,7 +193,7 @@ def daily_durations(user_timezone:, start_date: 365.days.ago, end_date: Time.cur SELECT time, least( - time - lagInFrame(time) OVER (ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW), + time - lagInFrame(time, 1, time) OVER (ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW), #{heartbeat_timeout_duration.to_i} ) AS diff FROM (#{with_valid_timestamps.where(time: start_date.to_f..end_date.to_f).to_sql}) AS hb @@ -218,7 +218,7 @@ def duration_seconds(scope = all) group_expr = group_column.to_s.include?("(") ? group_column : "`#{group_column}`" capped_diffs_sql = scope - .select("#{group_expr} as grouped_time, least(time - lagInFrame(time) OVER (PARTITION BY #{group_expr} ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW), #{timeout}) as diff") + .select("#{group_expr} as grouped_time, least(time - lagInFrame(time, 1, time) OVER (PARTITION BY #{group_expr} ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW), #{timeout}) as diff") .where.not(time: nil) .unscope(:group) .to_sql @@ -230,7 +230,7 @@ def duration_seconds(scope = all) end else capped_diffs_sql = scope - .select("least(time - lagInFrame(time) OVER (ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW), #{timeout}) as diff") + .select("least(time - lagInFrame(time, 1, time) OVER (ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW), #{timeout}) as diff") .where.not(time: nil) .to_sql @@ -276,7 +276,7 @@ def duration_seconds_boundary_aware(scope, start_time, end_time) timeout = heartbeat_timeout_duration.to_i capped_diffs_sql = combined_scope - .select("time, least(time - lagInFrame(time) OVER (ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW), #{timeout}) as diff") + .select("time, least(time - lagInFrame(time, 1, time) OVER (ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW), #{timeout}) as diff") .where.not(time: nil) .order(time: :asc) .to_sql diff --git a/app/services/heartbeat_import_service.rb b/app/services/heartbeat_import_service.rb index 5c56e15e6..00eb3a721 100644 --- a/app/services/heartbeat_import_service.rb +++ b/app/services/heartbeat_import_service.rb @@ -108,8 +108,10 @@ def self.flush_batch(seen_hashes) end ActiveRecord::Base.logger.silence do - Heartbeat.insert_all(records) - records.length + existing_hashes = Heartbeat.where(fields_hash: records.map { |r| r[:fields_hash] }).pluck(:fields_hash).to_set + new_records = records.reject { |r| existing_hashes.include?(r[:fields_hash]) } + Heartbeat.insert_all(new_records) if new_records.any? + new_records.length end end diff --git a/config/brakeman.ignore b/config/brakeman.ignore index 39cb1a651..7a93639f1 100644 --- a/config/brakeman.ignore +++ b/config/brakeman.ignore @@ -3,13 +3,59 @@ { "warning_type": "SQL Injection", "warning_code": 0, - "fingerprint": "06ca3650eaeb8d28e062c1c6dcbeab95fb2ccd0c5bb49165ad469bcb6b791d3e", + "fingerprint": "8bfb27cde848a37932bdcee60ced12ea578880374edf0f4b12da7b2b6fc32269", + "check_name": "SQL", + "message": "Possible SQL injection", + "file": "app/mailers/weekly_summary_mailer.rb", + "line": 53, + "link": "https://brakemanscanner.org/docs/warning_types/sql_injection/", + "code": "Heartbeat.connection.select_value(\"SELECT uniq(toDate(toDateTime(toUInt32(time), '#{@timezone_label}')))\nFROM (#{scope.where.not(:time => nil).to_sql}) AS hb\n\")", + "render_path": null, + "location": { + "type": "method", + "class": "WeeklySummaryMailer", + "method": "active_days_count" + }, + "user_input": "@timezone_label", + "confidence": "Medium", + "cwe_id": [ + 89 + ], + "note": "Timezone is validated via ActiveSupport::TimeZone lookup" + }, + { + "warning_type": "SQL Injection", + "warning_code": 0, + "fingerprint": "9b9294560765ae07af31fffc430808dc95ab2be8514b051e6d8fe06b49488832", "check_name": "SQL", "message": "Possible SQL injection", "file": "app/models/concerns/heartbeatable.rb", - "line": 175, + "line": 134, "link": "https://brakemanscanner.org/docs/warning_types/sql_injection/", - "code": "Arel.sql(\"DATE_TRUNC('day', to_timestamp(time) AT TIME ZONE '#{(user_timezone or \"UTC\")}')\")", + "code": "connection.select_all(\"SELECT\\n user_id,\\n toDate(toDateTime(toUInt32(time))) AS day_group,\\n toInt64(coalesce(sum(diff), 0)) AS duration\\nFROM (\\n SELECT\\n user_id,\\n time,\\n least(\\n time - lagInFrame(time) OVER (PARTITION BY user_id, toDate(toDateTime(toUInt32(time))) ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW),\\n #{heartbeat_timeout_duration.to_i}\\n ) AS diff\\n FROM heartbeats\\n WHERE user_id IN (#{user_ids.select do\\n Rails.cache.read_multi(*user_ids.map do\\n \\\"user_streak_#{id}\\\"\\n end)[\\\"user_streak_#{id}\\\"].nil?\\n end.join(\\\",\\\")})\\n AND category = 'coding'\\n AND time >= 0 AND time <= 253402300799\\n AND time >= #{[start_date, 30.days.ago].max.to_f}\\n AND time <= #{Time.current.to_f}\\n)\\nGROUP BY user_id, day_group\\n\")", + "render_path": null, + "location": { + "type": "method", + "class": "Heartbeatable", + "method": "daily_streaks_for_users" + }, + "user_input": "user_ids.select do\n Rails.cache.read_multi(*user_ids.map do\n \"user_streak_#{id}\"\n end)[\"user_streak_#{id}\"].nil?\n end.join(\",\")", + "confidence": "Medium", + "cwe_id": [ + 89 + ], + "note": "user_ids are integer IDs from internal queries, not user input" + }, + { + "warning_type": "SQL Injection", + "warning_code": 0, + "fingerprint": "c3d4e314e4dbd66500b55b2a0edc8b337ec85f907eb151248ba5c109e679c7fb", + "check_name": "SQL", + "message": "Possible SQL injection", + "file": "app/models/concerns/heartbeatable.rb", + "line": 205, + "link": "https://brakemanscanner.org/docs/warning_types/sql_injection/", + "code": "connection.select_all(\"SELECT\\n toDate(toDateTime(toUInt32(time), '#{(user_timezone or \\\"UTC\\\")}')) AS day_group,\\n toInt64(coalesce(sum(diff), 0)) AS duration\\nFROM (\\n SELECT\\n time,\\n least(\\n time - lagInFrame(time) OVER (ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW),\\n #{heartbeat_timeout_duration.to_i}\\n ) AS diff\\n FROM (#{with_valid_timestamps.where(:time => ((start_date.to_f..end_date.to_f))).to_sql}) AS hb\\n)\\nGROUP BY day_group\\nORDER BY day_group\\n\")", "render_path": null, "location": { "type": "method", @@ -21,7 +67,7 @@ "cwe_id": [ 89 ], - "note": "" + "note": "Timezone is validated against TZInfo::Timezone.all_identifiers" }, { "warning_type": "SQL Injection", diff --git a/db/clickhouse_schema.rb b/db/clickhouse_schema.rb new file mode 100644 index 000000000..471dbe3b4 --- /dev/null +++ b/db/clickhouse_schema.rb @@ -0,0 +1,55 @@ +# This file is auto-generated from the current state of the database. Instead +# of editing this file, please use the migrations feature of Active Record to +# incrementally modify your database, and then regenerate this schema definition. +# +# This file is the source Rails uses to define your schema when running `bin/rails +# db:schema:load`. When creating a new database, `bin/rails db:schema:load` tends to +# be faster and is potentially less error prone than running all of your +# migrations from scratch. Old migrations may fail to apply correctly if those +# migrations use external dependencies or application code. +# +# It's strongly recommended that you check this file into your version control system. + +ActiveRecord::Schema[8.1].define(version: 2026_03_24_000002) do + # TABLE: heartbeat_user_daily_summary + # SQL: CREATE TABLE heartbeat_user_daily_summary ( `user_id` Int64, `day` Date, `duration_s` Float64, `heartbeats` UInt32, `_version` DateTime DEFAULT now() ) ENGINE = ReplacingMergeTree(_version) ORDER BY (user_id, day) SETTINGS index_granularity = 8192 + create_table "heartbeat_user_daily_summary", id: false, options: "ReplacingMergeTree(_version) ORDER BY (user_id, day) SETTINGS index_granularity = 8192", force: :cascade do |t| + t.integer "user_id", unsigned: false, limit: 8, null: false + t.date "day", null: false + t.float "duration_s", null: false + t.integer "heartbeats", null: false + t.datetime "_version", precision: nil, default: -> { "now()" }, null: false + end + + # TABLE: heartbeats + # SQL: CREATE TABLE heartbeats ( `id` Int64, `user_id` Int64, `branch` String DEFAULT '', `category` String DEFAULT '', `dependencies` Array(String), `editor` String DEFAULT '', `entity` String DEFAULT '', `language` String DEFAULT '', `machine` String DEFAULT '', `operating_system` String DEFAULT '', `project` String DEFAULT '', `type` String DEFAULT '', `user_agent` String DEFAULT '', `line_additions` Int32 DEFAULT 0, `line_deletions` Int32 DEFAULT 0, `lineno` Int32 DEFAULT 0, `lines` Int32 DEFAULT 0, `cursorpos` Int32 DEFAULT 0, `project_root_count` Int32 DEFAULT 0, `time` Float64, `is_write` UInt8 DEFAULT 0, `created_at` DateTime64(6) DEFAULT now64(), `updated_at` DateTime64(6) DEFAULT now64(), `source_type` UInt8 DEFAULT 0, `ip_address` String DEFAULT '', `ysws_program` UInt8 DEFAULT 0, `fields_hash` String DEFAULT '' ) ENGINE = ReplacingMergeTree PARTITION BY toYYYYMM(toDateTime(toUInt32(time))) ORDER BY (user_id, toDate(toDateTime(toUInt32(time))), project, id) SETTINGS index_granularity = 8192 + create_table "heartbeats", id: :int64, options: "ReplacingMergeTree PARTITION BY toYYYYMM(toDateTime(toUInt32(time))) ORDER BY (user_id, toDate(toDateTime(toUInt32(time))), project, id) SETTINGS index_granularity = 8192", force: :cascade do |t| + t.integer "id", unsigned: false, limit: 8, null: false + t.integer "user_id", unsigned: false, limit: 8, null: false + t.string "branch", default: "", null: false + t.string "category", default: "", null: false + t.string "dependencies", array: true, null: false + t.string "editor", default: "", null: false + t.string "entity", default: "", null: false + t.string "language", default: "", null: false + t.string "machine", default: "", null: false + t.string "operating_system", default: "", null: false + t.string "project", default: "", null: false + t.string "type", default: "", null: false + t.string "user_agent", default: "", null: false + t.integer "line_additions", unsigned: false, default: 0, null: false + t.integer "line_deletions", unsigned: false, default: 0, null: false + t.integer "lineno", unsigned: false, default: 0, null: false + t.integer "lines", unsigned: false, default: 0, null: false + t.integer "cursorpos", unsigned: false, default: 0, null: false + t.integer "project_root_count", unsigned: false, default: 0, null: false + t.float "time", null: false + t.integer "is_write", limit: 1, default: 0, null: false + t.datetime "created_at", default: -> { "now64()" }, null: false + t.datetime "updated_at", default: -> { "now64()" }, null: false + t.integer "source_type", limit: 1, default: 0, null: false + t.string "ip_address", default: "", null: false + t.integer "ysws_program", limit: 1, default: 0, null: false + t.string "fields_hash", default: "", null: false + end +end From 9d96ea5a1329874505de0747b1ecb5137b062373 Mon Sep 17 00:00:00 2001 From: Mahad Kalam Date: Tue, 24 Mar 2026 14:28:01 +0000 Subject: [PATCH 04/24] Fix the CI? --- .github/workflows/ci.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7147caf12..7f7d25432 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -113,6 +113,11 @@ jobs: POSTGRES_USER: postgres POSTGRES_PASSWORD: postgres POSTGRES_DB: app_test + clickhouse: + image: clickhouse/clickhouse-server:latest + ports: + - 8123:8123 + options: --health-cmd "wget --no-verbose --tries=1 --spider http://localhost:8123/ping || exit 1" --health-interval 10s --health-timeout 5s --health-retries 5 steps: - name: Checkout code @@ -130,6 +135,7 @@ jobs: TEST_DATABASE_URL: postgres://postgres:postgres@localhost:5432/app_test PGHOST: localhost PGUSER: postgres + CLICKHOUSE_HOST: localhost PGPASSWORD: postgres run: | bin/rails db:create RAILS_ENV=test @@ -142,6 +148,7 @@ jobs: TEST_DATABASE_URL: postgres://postgres:postgres@localhost:5432/app_test PGHOST: localhost PGUSER: postgres + CLICKHOUSE_HOST: localhost PGPASSWORD: postgres run: | bin/rails rswag:specs:swaggerize @@ -160,6 +167,11 @@ jobs: POSTGRES_USER: postgres POSTGRES_PASSWORD: postgres POSTGRES_DB: app_test + clickhouse: + image: clickhouse/clickhouse-server:latest + ports: + - 8123:8123 + options: --health-cmd "wget --no-verbose --tries=1 --spider http://localhost:8123/ping || exit 1" --health-interval 10s --health-timeout 5s --health-retries 5 steps: - name: Checkout code @@ -200,6 +212,7 @@ jobs: TEST_DATABASE_URL: postgres://postgres:postgres@localhost:5432/app_test PGHOST: localhost PGUSER: postgres + CLICKHOUSE_HOST: localhost PGPASSWORD: postgres CHROME_BIN: ${{ steps.setup-chrome.outputs.chrome-path }} CHROMEDRIVER_BIN: ${{ steps.setup-chrome.outputs.chromedriver-path }} From 57efaeb96fea3a864e3a816b51e3f7ee1f38fc52 Mon Sep 17 00:00:00 2001 From: Mahad Kalam Date: Tue, 24 Mar 2026 14:30:52 +0000 Subject: [PATCH 05/24] Goog --- .github/workflows/ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7f7d25432..1c9e69b8f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -117,6 +117,8 @@ jobs: image: clickhouse/clickhouse-server:latest ports: - 8123:8123 + env: + CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: 1 options: --health-cmd "wget --no-verbose --tries=1 --spider http://localhost:8123/ping || exit 1" --health-interval 10s --health-timeout 5s --health-retries 5 steps: @@ -171,6 +173,8 @@ jobs: image: clickhouse/clickhouse-server:latest ports: - 8123:8123 + env: + CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: 1 options: --health-cmd "wget --no-verbose --tries=1 --spider http://localhost:8123/ping || exit 1" --health-interval 10s --health-timeout 5s --health-retries 5 steps: From c6ffded5d251f7c89e30326afe313c21db9bb0b8 Mon Sep 17 00:00:00 2001 From: Mahad Kalam Date: Tue, 24 Mar 2026 14:41:10 +0000 Subject: [PATCH 06/24] goog --- .../api/hackatime/v1/hackatime_controller.rb | 49 ++++++++++++++----- app/jobs/aggregate_user_daily_summary_job.rb | 30 ++++++++++++ app/models/heartbeat.rb | 6 +++ app/services/profile_stats_service.rb | 2 +- config/initializers/good_job.rb | 5 ++ 5 files changed, 78 insertions(+), 14 deletions(-) create mode 100644 app/jobs/aggregate_user_daily_summary_job.rb diff --git a/app/controllers/api/hackatime/v1/hackatime_controller.rb b/app/controllers/api/hackatime/v1/hackatime_controller.rb index 97bb5520b..2ba3128dd 100644 --- a/app/controllers/api/hackatime/v1/hackatime_controller.rb +++ b/app/controllers/api/hackatime/v1/hackatime_controller.rb @@ -234,7 +234,41 @@ def body_to_json def handle_heartbeat(heartbeat_array) results = [] last_language = nil - heartbeat_array.each do |heartbeat| + prepared = prepare_heartbeat_attrs(heartbeat_array) + all_hashes = prepared.map { |p| p[:fields_hash] }.compact + existing_hashes = if all_hashes.any? + Heartbeat.where(fields_hash: all_hashes).pluck(:fields_hash).to_set + else + Set.new + end + + prepared.each do |item| + heartbeat = item[:attrs] + fields_hash = item[:fields_hash] + source_type = item[:source_type] + + if existing_hashes.include?(fields_hash) + existing = Heartbeat.where(fields_hash: fields_hash).first + queue_project_mapping(heartbeat[:project]) + results << [ existing.attributes, 201 ] + else + new_heartbeat = Heartbeat.create!(heartbeat.merge(fields_hash: fields_hash)) + existing_hashes.add(fields_hash) + queue_project_mapping(heartbeat[:project]) + results << [ new_heartbeat.attributes, 201 ] + end + rescue => e + report_error(e, message: "Error creating heartbeat") + results << [ { error: e.message, type: e.class.name }, 422 ] + end + + PosthogService.capture_once_per_day(@user, "heartbeat_sent", { heartbeat_count: heartbeat_array.size }) + results + end + + def prepare_heartbeat_attrs(heartbeat_array) + last_language = nil + heartbeat_array.filter_map do |heartbeat| heartbeat = heartbeat.to_h.with_indifferent_access source_type = :direct_entry @@ -283,19 +317,8 @@ def handle_heartbeat(heartbeat_array) # ^^ They say safety laws are written in blood. Well, so is this line! # Basically this filters out columns that aren't in our DB (the biggest one being raw_data) fields_hash = Heartbeat.generate_fields_hash(attrs) - new_heartbeat = Heartbeat.find_or_create_by(fields_hash: fields_hash) do |hb| - hb.assign_attributes(attrs) - end - - queue_project_mapping(heartbeat[:project]) - results << [ new_heartbeat.attributes, 201 ] - rescue => e - report_error(e, message: "Error creating heartbeat") - results << [ { error: e.message, type: e.class.name }, 422 ] + { attrs: attrs, fields_hash: fields_hash, source_type: source_type } end - - PosthogService.capture_once_per_day(@user, "heartbeat_sent", { heartbeat_count: heartbeat_array.size }) - results end def queue_project_mapping(project_name) diff --git a/app/jobs/aggregate_user_daily_summary_job.rb b/app/jobs/aggregate_user_daily_summary_job.rb new file mode 100644 index 000000000..072c0f608 --- /dev/null +++ b/app/jobs/aggregate_user_daily_summary_job.rb @@ -0,0 +1,30 @@ +class AggregateUserDailySummaryJob < ApplicationJob + include GoodJob::ActiveJobExtensions::Concurrency + + queue_as :latency_5m + + good_job_control_concurrency_with( + total: 1, + drop: true + ) + + def perform + conn = HeartbeatUserDailySummary.connection + + conn.execute(<<~SQL) + INSERT INTO heartbeat_user_daily_summary (user_id, day, duration_s, heartbeats) + SELECT + user_id, + toDate(toDateTime(toUInt32(time))) AS day, + sum( + least( + time - lagInFrame(time, 1, time) OVER (PARTITION BY user_id, toDate(toDateTime(toUInt32(time))) ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW), + 120 + ) + ) AS duration_s, + count() AS heartbeats + FROM heartbeats + GROUP BY user_id, day + SQL + end +end diff --git a/app/models/heartbeat.rb b/app/models/heartbeat.rb index 0bcf5b034..5c16aee4c 100644 --- a/app/models/heartbeat.rb +++ b/app/models/heartbeat.rb @@ -6,6 +6,12 @@ class Heartbeat < ClickhouseRecord time_range_filterable_field :time + before_create :set_fields_hash!, if: -> { fields_hash.blank? } + + def set_fields_hash! + self.fields_hash = self.class.generate_fields_hash(attributes) + end + scope :today, -> { where(time: Time.current.beginning_of_day.to_i..Time.current.end_of_day.to_i) } scope :recent, -> { where("time > ?", 24.hours.ago.to_i) } diff --git a/app/services/profile_stats_service.rb b/app/services/profile_stats_service.rb index 0b564bcc6..f98cb5fd1 100644 --- a/app/services/profile_stats_service.rb +++ b/app/services/profile_stats_service.rb @@ -62,7 +62,7 @@ def compute_totals_and_breakdowns(timeout, today_start, today_end, week_start, w language, editor, least( - time - lagInFrame(time) OVER (ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW), + time - lagInFrame(time, 1, time) OVER (ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW), #{timeout_quoted} ) AS diff FROM heartbeats diff --git a/config/initializers/good_job.rb b/config/initializers/good_job.rb index c42d57b9b..2a1d98c40 100644 --- a/config/initializers/good_job.rb +++ b/config/initializers/good_job.rb @@ -79,6 +79,11 @@ class: "Cache::CurrentlyHackingJob", kwargs: { force_reload: true } }, + aggregate_user_daily_summary: { + cron: "*/10 * * * *", + class: "AggregateUserDailySummaryJob", + description: "Aggregates heartbeats into per-user daily summaries for home stats." + }, cache_home_stats: { cron: "*/10 * * * *", class: "Cache::HomeStatsJob", From 638e3bab714c5686aafd41cb039b3a3e0e0c0534 Mon Sep 17 00:00:00 2001 From: Mahad Kalam Date: Tue, 24 Mar 2026 14:49:05 +0000 Subject: [PATCH 07/24] goog! --- test/system/heartbeat_export_test.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/system/heartbeat_export_test.rb b/test/system/heartbeat_export_test.rb index 8f79a86d8..3b33b91cf 100644 --- a/test/system/heartbeat_export_test.rb +++ b/test/system/heartbeat_export_test.rb @@ -1,7 +1,7 @@ require "application_system_test_case" class HeartbeatExportTest < ApplicationSystemTestCase - fixtures :users, :email_addresses, :heartbeats, :sign_in_tokens, :api_keys, :admin_api_keys + fixtures :users, :email_addresses, :sign_in_tokens, :api_keys, :admin_api_keys setup do GoodJob::Job.delete_all From ac65647d225db521e2e56ed6109743dfd2c67d0c Mon Sep 17 00:00:00 2001 From: Mahad Kalam Date: Tue, 24 Mar 2026 14:52:16 +0000 Subject: [PATCH 08/24] Goog :sob: --- test/services/heartbeat_import_dump_client_test.rb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/services/heartbeat_import_dump_client_test.rb b/test/services/heartbeat_import_dump_client_test.rb index 4aec68d75..cd125f0ed 100644 --- a/test/services/heartbeat_import_dump_client_test.rb +++ b/test/services/heartbeat_import_dump_client_test.rb @@ -2,6 +2,10 @@ require "webmock/minitest" class HeartbeatImportDumpClientTest < ActiveSupport::TestCase + setup do + WebMock.disable_net_connect!(allow_localhost: true) + end + test "valid_wakatime_download_url? only accepts wakatime s3 links over https" do assert HeartbeatImportDumpClient.valid_wakatime_download_url?("https://wakatime.s3.amazonaws.com/export.json?signature=abc") assert_not HeartbeatImportDumpClient.valid_wakatime_download_url?("http://wakatime.s3.amazonaws.com/export.json") From f5c3f422ce3d0d8445a4c274bb6b2ad87badaa81 Mon Sep 17 00:00:00 2001 From: Mahad Kalam Date: Tue, 24 Mar 2026 15:38:04 +0000 Subject: [PATCH 09/24] Fixes? --- app/models/heartbeat.rb | 10 ++ config/brakeman.ignore | 49 ++----- config/database.yml | 2 +- db/clickhouse_schema.rb | 125 +++++++++++++----- .../heartbeat_import_dump_client_test.rb | 5 - test/test_helper.rb | 22 +++ 6 files changed, 135 insertions(+), 78 deletions(-) diff --git a/app/models/heartbeat.rb b/app/models/heartbeat.rb index 5c16aee4c..6083927eb 100644 --- a/app/models/heartbeat.rb +++ b/app/models/heartbeat.rb @@ -1,17 +1,27 @@ class Heartbeat < ClickhouseRecord self.table_name = "heartbeats" + self.primary_key = :id include Heartbeatable include TimeRangeFilterable time_range_filterable_field :time + before_create :set_clickhouse_id!, if: -> { self[:id].blank? } before_create :set_fields_hash!, if: -> { fields_hash.blank? } + CLICKHOUSE_ID_RANDOM_BITS = 10 + CLICKHOUSE_ID_RANDOM_MAX = 1 << CLICKHOUSE_ID_RANDOM_BITS + def set_fields_hash! self.fields_hash = self.class.generate_fields_hash(attributes) end + def set_clickhouse_id! + timestamp_us = (Time.current.to_r * 1_000_000).to_i + self[:id] = (timestamp_us << CLICKHOUSE_ID_RANDOM_BITS) | SecureRandom.random_number(CLICKHOUSE_ID_RANDOM_MAX) + end + scope :today, -> { where(time: Time.current.beginning_of_day.to_i..Time.current.end_of_day.to_i) } scope :recent, -> { where("time > ?", 24.hours.ago.to_i) } diff --git a/config/brakeman.ignore b/config/brakeman.ignore index 7a93639f1..cd6e65f28 100644 --- a/config/brakeman.ignore +++ b/config/brakeman.ignore @@ -3,71 +3,48 @@ { "warning_type": "SQL Injection", "warning_code": 0, - "fingerprint": "8bfb27cde848a37932bdcee60ced12ea578880374edf0f4b12da7b2b6fc32269", - "check_name": "SQL", - "message": "Possible SQL injection", - "file": "app/mailers/weekly_summary_mailer.rb", - "line": 53, - "link": "https://brakemanscanner.org/docs/warning_types/sql_injection/", - "code": "Heartbeat.connection.select_value(\"SELECT uniq(toDate(toDateTime(toUInt32(time), '#{@timezone_label}')))\nFROM (#{scope.where.not(:time => nil).to_sql}) AS hb\n\")", - "render_path": null, - "location": { - "type": "method", - "class": "WeeklySummaryMailer", - "method": "active_days_count" - }, - "user_input": "@timezone_label", - "confidence": "Medium", - "cwe_id": [ - 89 - ], - "note": "Timezone is validated via ActiveSupport::TimeZone lookup" - }, - { - "warning_type": "SQL Injection", - "warning_code": 0, - "fingerprint": "9b9294560765ae07af31fffc430808dc95ab2be8514b051e6d8fe06b49488832", + "fingerprint": "4cffceb1f5c7d23d28b9846734a5a7333e1005874e310d65b39a32a5221990f9", "check_name": "SQL", "message": "Possible SQL injection", "file": "app/models/concerns/heartbeatable.rb", - "line": 134, + "line": 127, "link": "https://brakemanscanner.org/docs/warning_types/sql_injection/", - "code": "connection.select_all(\"SELECT\\n user_id,\\n toDate(toDateTime(toUInt32(time))) AS day_group,\\n toInt64(coalesce(sum(diff), 0)) AS duration\\nFROM (\\n SELECT\\n user_id,\\n time,\\n least(\\n time - lagInFrame(time) OVER (PARTITION BY user_id, toDate(toDateTime(toUInt32(time))) ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW),\\n #{heartbeat_timeout_duration.to_i}\\n ) AS diff\\n FROM heartbeats\\n WHERE user_id IN (#{user_ids.select do\\n Rails.cache.read_multi(*user_ids.map do\\n \\\"user_streak_#{id}\\\"\\n end)[\\\"user_streak_#{id}\\\"].nil?\\n end.join(\\\",\\\")})\\n AND category = 'coding'\\n AND time >= 0 AND time <= 253402300799\\n AND time >= #{[start_date, 30.days.ago].max.to_f}\\n AND time <= #{Time.current.to_f}\\n)\\nGROUP BY user_id, day_group\\n\")", + "code": null, "render_path": null, "location": { "type": "method", "class": "Heartbeatable", "method": "daily_streaks_for_users" }, - "user_input": "user_ids.select do\n Rails.cache.read_multi(*user_ids.map do\n \"user_streak_#{id}\"\n end)[\"user_streak_#{id}\"].nil?\n end.join(\",\")", + "user_input": null, "confidence": "Medium", "cwe_id": [ 89 ], - "note": "user_ids are integer IDs from internal queries, not user input" + "note": "user_ids are integer IDs from internal queries, timeout and dates use .to_i/.to_f" }, { "warning_type": "SQL Injection", "warning_code": 0, - "fingerprint": "c3d4e314e4dbd66500b55b2a0edc8b337ec85f907eb151248ba5c109e679c7fb", + "fingerprint": "076acac8c26cca89c2cbbcf0bad1bf4a45067b33c331aac087fdb8f4326590ff", "check_name": "SQL", "message": "Possible SQL injection", "file": "app/models/concerns/heartbeatable.rb", "line": 205, "link": "https://brakemanscanner.org/docs/warning_types/sql_injection/", - "code": "connection.select_all(\"SELECT\\n toDate(toDateTime(toUInt32(time), '#{(user_timezone or \\\"UTC\\\")}')) AS day_group,\\n toInt64(coalesce(sum(diff), 0)) AS duration\\nFROM (\\n SELECT\\n time,\\n least(\\n time - lagInFrame(time) OVER (ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW),\\n #{heartbeat_timeout_duration.to_i}\\n ) AS diff\\n FROM (#{with_valid_timestamps.where(:time => ((start_date.to_f..end_date.to_f))).to_sql}) AS hb\\n)\\nGROUP BY day_group\\nORDER BY day_group\\n\")", + "code": null, "render_path": null, "location": { "type": "method", "class": "Heartbeatable", "method": "daily_durations" }, - "user_input": "user_timezone", + "user_input": null, "confidence": "Medium", "cwe_id": [ 89 ], - "note": "Timezone is validated against TZInfo::Timezone.all_identifiers" + "note": "Timezone is validated against TZInfo::Timezone.all_identifiers, timeout uses .to_i" }, { "warning_type": "SQL Injection", @@ -78,20 +55,20 @@ "file": "app/jobs/one_time/generate_unique_heartbeat_hashes_job.rb", "line": 32, "link": "https://brakemanscanner.org/docs/warning_types/sql_injection/", - "code": "Heartbeat.where(:id => chunk.map do\n index = 1\nputs(\"Processing heartbeat #{heartbeat.id} (#{1} of #{batch.size})\")\nfield_hash = Heartbeat.generate_fields_hash(heartbeat.attributes)\nputs(\"Field hash: #{Heartbeat.generate_fields_hash(heartbeat.attributes)}\")\n[heartbeat.id, Heartbeat.generate_fields_hash(heartbeat.attributes)]\n end.map(&:first)).update_all(\"fields_hash = CASE #{chunk.map do\n index = 1\nputs(\"Processing heartbeat #{heartbeat.id} (#{1} of #{batch.size})\")\nfield_hash = Heartbeat.generate_fields_hash(heartbeat.attributes)\nputs(\"Field hash: #{Heartbeat.generate_fields_hash(heartbeat.attributes)}\")\n[heartbeat.id, Heartbeat.generate_fields_hash(heartbeat.attributes)]\n end.map do\n \"WHEN id = #{id} THEN '#{hash}'\"\n end.join(\" \")} END\")", + "code": null, "render_path": null, "location": { "type": "method", "class": "OneTime::GenerateUniqueHeartbeatHashesJob", "method": "perform" }, - "user_input": "Heartbeat.generate_fields_hash(heartbeat.attributes)", + "user_input": null, "confidence": "High", "cwe_id": [ 89 ], - "note": "" + "note": "One-time migration job using internally generated hashes" } ], - "brakeman_version": "7.0.2" + "brakeman_version": "8.0.4" } diff --git a/config/database.yml b/config/database.yml index 088291770..e16e2d49f 100644 --- a/config/database.yml +++ b/config/database.yml @@ -47,7 +47,7 @@ test: adapter: clickhouse host: <%= ENV.fetch('CLICKHOUSE_HOST', 'clickhouse') %> port: <%= ENV.fetch('CLICKHOUSE_PORT', 8123) %> - database: <%= ENV.fetch('CLICKHOUSE_DATABASE', 'hackatime_test') %> + database: hackatime_test username: <%= ENV.fetch('CLICKHOUSE_USERNAME', 'default') %> password: <%= ENV.fetch('CLICKHOUSE_PASSWORD', '') %> migrations_paths: db/migrate_clickhouse diff --git a/db/clickhouse_schema.rb b/db/clickhouse_schema.rb index 471dbe3b4..38a7727ad 100644 --- a/db/clickhouse_schema.rb +++ b/db/clickhouse_schema.rb @@ -11,45 +11,98 @@ # It's strongly recommended that you check this file into your version control system. ActiveRecord::Schema[8.1].define(version: 2026_03_24_000002) do + # TABLE: schema_migrations + # SQL: CREATE TABLE schema_migrations ( `version` String, `active` Int8 DEFAULT 1, `ver` DateTime DEFAULT now() ) ENGINE = ReplacingMergeTree(ver) ORDER BY (version) + execute <<~SQL + DROP TABLE IF EXISTS schema_migrations + SQL + execute <<~SQL + CREATE TABLE schema_migrations + ( + `version` String, + `active` Int8 DEFAULT 1, + `ver` DateTime DEFAULT now() + ) + ENGINE = ReplacingMergeTree(ver) + ORDER BY (version) + SQL + + # TABLE: ar_internal_metadata + # SQL: CREATE TABLE ar_internal_metadata ( `key` String, `value` String, `created_at` DateTime, `updated_at` DateTime ) ENGINE = ReplacingMergeTree(created_at) PARTITION BY key ORDER BY key + execute <<~SQL + DROP TABLE IF EXISTS ar_internal_metadata + SQL + execute <<~SQL + CREATE TABLE ar_internal_metadata + ( + `key` String, + `value` String, + `created_at` DateTime, + `updated_at` DateTime + ) + ENGINE = ReplacingMergeTree(created_at) + PARTITION BY key + ORDER BY key + SQL + # TABLE: heartbeat_user_daily_summary # SQL: CREATE TABLE heartbeat_user_daily_summary ( `user_id` Int64, `day` Date, `duration_s` Float64, `heartbeats` UInt32, `_version` DateTime DEFAULT now() ) ENGINE = ReplacingMergeTree(_version) ORDER BY (user_id, day) SETTINGS index_granularity = 8192 - create_table "heartbeat_user_daily_summary", id: false, options: "ReplacingMergeTree(_version) ORDER BY (user_id, day) SETTINGS index_granularity = 8192", force: :cascade do |t| - t.integer "user_id", unsigned: false, limit: 8, null: false - t.date "day", null: false - t.float "duration_s", null: false - t.integer "heartbeats", null: false - t.datetime "_version", precision: nil, default: -> { "now()" }, null: false - end + execute <<~SQL + DROP TABLE IF EXISTS heartbeat_user_daily_summary + SQL + execute <<~SQL + CREATE TABLE heartbeat_user_daily_summary + ( + `user_id` Int64, + `day` Date, + `duration_s` Float64, + `heartbeats` UInt32, + `_version` DateTime DEFAULT now() + ) + ENGINE = ReplacingMergeTree(_version) + ORDER BY (user_id, day) + SETTINGS index_granularity = 8192 + SQL # TABLE: heartbeats # SQL: CREATE TABLE heartbeats ( `id` Int64, `user_id` Int64, `branch` String DEFAULT '', `category` String DEFAULT '', `dependencies` Array(String), `editor` String DEFAULT '', `entity` String DEFAULT '', `language` String DEFAULT '', `machine` String DEFAULT '', `operating_system` String DEFAULT '', `project` String DEFAULT '', `type` String DEFAULT '', `user_agent` String DEFAULT '', `line_additions` Int32 DEFAULT 0, `line_deletions` Int32 DEFAULT 0, `lineno` Int32 DEFAULT 0, `lines` Int32 DEFAULT 0, `cursorpos` Int32 DEFAULT 0, `project_root_count` Int32 DEFAULT 0, `time` Float64, `is_write` UInt8 DEFAULT 0, `created_at` DateTime64(6) DEFAULT now64(), `updated_at` DateTime64(6) DEFAULT now64(), `source_type` UInt8 DEFAULT 0, `ip_address` String DEFAULT '', `ysws_program` UInt8 DEFAULT 0, `fields_hash` String DEFAULT '' ) ENGINE = ReplacingMergeTree PARTITION BY toYYYYMM(toDateTime(toUInt32(time))) ORDER BY (user_id, toDate(toDateTime(toUInt32(time))), project, id) SETTINGS index_granularity = 8192 - create_table "heartbeats", id: :int64, options: "ReplacingMergeTree PARTITION BY toYYYYMM(toDateTime(toUInt32(time))) ORDER BY (user_id, toDate(toDateTime(toUInt32(time))), project, id) SETTINGS index_granularity = 8192", force: :cascade do |t| - t.integer "id", unsigned: false, limit: 8, null: false - t.integer "user_id", unsigned: false, limit: 8, null: false - t.string "branch", default: "", null: false - t.string "category", default: "", null: false - t.string "dependencies", array: true, null: false - t.string "editor", default: "", null: false - t.string "entity", default: "", null: false - t.string "language", default: "", null: false - t.string "machine", default: "", null: false - t.string "operating_system", default: "", null: false - t.string "project", default: "", null: false - t.string "type", default: "", null: false - t.string "user_agent", default: "", null: false - t.integer "line_additions", unsigned: false, default: 0, null: false - t.integer "line_deletions", unsigned: false, default: 0, null: false - t.integer "lineno", unsigned: false, default: 0, null: false - t.integer "lines", unsigned: false, default: 0, null: false - t.integer "cursorpos", unsigned: false, default: 0, null: false - t.integer "project_root_count", unsigned: false, default: 0, null: false - t.float "time", null: false - t.integer "is_write", limit: 1, default: 0, null: false - t.datetime "created_at", default: -> { "now64()" }, null: false - t.datetime "updated_at", default: -> { "now64()" }, null: false - t.integer "source_type", limit: 1, default: 0, null: false - t.string "ip_address", default: "", null: false - t.integer "ysws_program", limit: 1, default: 0, null: false - t.string "fields_hash", default: "", null: false - end + execute <<~SQL + DROP TABLE IF EXISTS heartbeats + SQL + execute <<~SQL + CREATE TABLE heartbeats + ( + `id` Int64, + `user_id` Int64, + `branch` String DEFAULT '', + `category` String DEFAULT '', + `dependencies` Array(String), + `editor` String DEFAULT '', + `entity` String DEFAULT '', + `language` String DEFAULT '', + `machine` String DEFAULT '', + `operating_system` String DEFAULT '', + `project` String DEFAULT '', + `type` String DEFAULT '', + `user_agent` String DEFAULT '', + `line_additions` Int32 DEFAULT 0, + `line_deletions` Int32 DEFAULT 0, + `lineno` Int32 DEFAULT 0, + `lines` Int32 DEFAULT 0, + `cursorpos` Int32 DEFAULT 0, + `project_root_count` Int32 DEFAULT 0, + `time` Float64, + `is_write` UInt8 DEFAULT 0, + `created_at` DateTime64(6) DEFAULT now64(), + `updated_at` DateTime64(6) DEFAULT now64(), + `source_type` UInt8 DEFAULT 0, + `ip_address` String DEFAULT '', + `ysws_program` UInt8 DEFAULT 0, + `fields_hash` String DEFAULT '' + ) + ENGINE = ReplacingMergeTree + PARTITION BY toYYYYMM(toDateTime(toUInt32(time))) + ORDER BY (user_id, toDate(toDateTime(toUInt32(time))), project, id) + SETTINGS index_granularity = 8192 + SQL end diff --git a/test/services/heartbeat_import_dump_client_test.rb b/test/services/heartbeat_import_dump_client_test.rb index cd125f0ed..94e69da18 100644 --- a/test/services/heartbeat_import_dump_client_test.rb +++ b/test/services/heartbeat_import_dump_client_test.rb @@ -1,11 +1,6 @@ require "test_helper" -require "webmock/minitest" class HeartbeatImportDumpClientTest < ActiveSupport::TestCase - setup do - WebMock.disable_net_connect!(allow_localhost: true) - end - test "valid_wakatime_download_url? only accepts wakatime s3 links over https" do assert HeartbeatImportDumpClient.valid_wakatime_download_url?("https://wakatime.s3.amazonaws.com/export.json?signature=abc") assert_not HeartbeatImportDumpClient.valid_wakatime_download_url?("http://wakatime.s3.amazonaws.com/export.json") diff --git a/test/test_helper.rb b/test/test_helper.rb index 75a9bc40a..0344aea4a 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -3,6 +3,10 @@ require "rails/test_help" require "nokogiri" require "json" +require "webmock/minitest" + +# ClickHouse uses HTTP in test, so allow that host while keeping other external requests blocked. +WebMock.disable_net_connect!(allow_localhost: true, allow: "clickhouse") module ActiveSupport class TestCase @@ -28,6 +32,24 @@ class TestCase ] # Add more helper methods to be used by all tests here... + private + + def before_setup + truncate_clickhouse_tables + super + end + + def after_teardown + truncate_clickhouse_tables + super + end + + def truncate_clickhouse_tables + Heartbeat.connection.execute("TRUNCATE TABLE #{Heartbeat.connection.quote_table_name(Heartbeat.table_name)}") + HeartbeatUserDailySummary.connection.execute( + "TRUNCATE TABLE #{HeartbeatUserDailySummary.connection.quote_table_name(HeartbeatUserDailySummary.table_name)}" + ) + end end end From c36a064f4d5cad728038de48f603c7b4b15b0dc2 Mon Sep 17 00:00:00 2001 From: Mahad Kalam Date: Tue, 24 Mar 2026 15:41:38 +0000 Subject: [PATCH 10/24] Fix test_system CI: remove ClickHouse heartbeat fixtures, guard Slack API call --- app/models/concerns/slack_integration.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/app/models/concerns/slack_integration.rb b/app/models/concerns/slack_integration.rb index c6b868a99..6790e7292 100644 --- a/app/models/concerns/slack_integration.rb +++ b/app/models/concerns/slack_integration.rb @@ -45,6 +45,7 @@ def update_from_slack def update_slack_status return unless uses_slack_status? + return if slack_access_token.blank? current_status_response = HTTP.auth("Bearer #{slack_access_token}") .get("https://slack.com/api/users.profile.get") From 96925f7baaf6462bca16316bb09d96279277b2ec Mon Sep 17 00:00:00 2001 From: Mahad Kalam Date: Tue, 24 Mar 2026 16:03:41 +0000 Subject: [PATCH 11/24] Goog? --- app/javascript/layouts/AppLayout.svelte | 17 ++++--- app/views/layouts/application.html.erb | 8 +++- config/initializers/git_version.rb | 21 +++------ lib/git_metadata.rb | 27 +++++++++++ test/lib/git_metadata_test.rb | 59 +++++++++++++++++++++++++ 5 files changed, 109 insertions(+), 23 deletions(-) create mode 100644 lib/git_metadata.rb create mode 100644 test/lib/git_metadata_test.rb diff --git a/app/javascript/layouts/AppLayout.svelte b/app/javascript/layouts/AppLayout.svelte index a38722b1d..8f57f744a 100644 --- a/app/javascript/layouts/AppLayout.svelte +++ b/app/javascript/layouts/AppLayout.svelte @@ -42,7 +42,7 @@ type Footer = { git_version: string; - commit_link: string; + commit_link: string | null; server_start_time_ago: string; heartbeat_recent_count: number; heartbeat_recent_imported_count: number; @@ -594,11 +594,16 @@

- Using Inertia. Build {layout.footer.git_version} + Using Inertia. Build + {#if layout.footer.commit_link} + {layout.footer.git_version} + {:else} + {layout.footer.git_version} + {/if} from {layout.footer.server_start_time_ago} ago. {footerStatsText()}

{#if layout.show_stop_impersonating} diff --git a/app/views/layouts/application.html.erb b/app/views/layouts/application.html.erb index 08038434c..5ede36622 100644 --- a/app/views/layouts/application.html.erb +++ b/app/views/layouts/application.html.erb @@ -233,7 +233,13 @@