# postfix_exporter/lib/collector/postfix.rb
# (repository viewer header: 416 lines, 23 KiB, Ruby)

class Collector::Postfix
class Noqueue
def initialize store, prometheus
@store = store
@noqueue = prometheus.counter :total, docstring: 'Total noqueued by reasons.', labels: %i[reason]
@codes = prometheus.counter :status_code_total, docstring: 'Total noqueued by status code', labels: %i[status_code enhanced_status_code]
['dnsbl', 'no reverse hostname', 'user does not exist', '<any>'].each {|r| @noqueue.increment by: 0, labels: {reason: r} }
end
def collect entry
case entry.message
when /\ANOQUEUE: reject: RCPT from (?:[^ ]+): (\d+) (\d+\.\d+\.\d+) (?:[^ ]+): (.*?),/
code, enh = $1, $2
@codes.increment labels: {status_code: code, enhanced_status_code: enh}
end
case entry.message
when /\ANOQUEUE: reject: RCPT from .* blocked using /
@noqueue.increment labels: {reason: 'dnsbl'}
when /\ANOQUEUE: reject: RCPT from .* Message rejected due to: SPF fail - not authorized\. /
@noqueue.increment labels: {reason: 'spf fail'}
when / Client host rejected: cannot find your reverse hostname/
@noqueue.increment labels: {reason: 'no reverse hostname'}
when / User doesn't exist: /
@noqueue.increment labels: {reason: 'user does not exist'}
else
STDERR.puts "# #{entry._systemd_unit} #{entry.syslog_identifier} NOQUEUE: #{entry.message}"
@noqueue.increment labels: {reason: '<any>'}
end
end
end
class Cache
def initialize store, prometheus
@store = store
2022-05-23 14:17:11 +02:00
@full_cleanup = prometheus.counter :full_cleanup_total, docstring: 'A counter of total cache cleanups', labels: %i[file]
@full_cleanup_retained = prometheus.gauge :full_cleanup_retained_entries, docstring: 'Retained entries of last cache cleanups', labels: %i[file]
@full_cleanup_dropped = prometheus.counter :full_cleanup_dropped_entries_total, docstring: 'Total dropped entries of cache cleanups', labels: %i[file]
end
def collect entry
case entry.message
when /\Acache (.*) full cleanup: retained=(\d+) dropped=(\d+) entries/
# postscreen: cache lmdb:/var/lib/postfix/postscreen_cache full cleanup: retained=128 dropped=14 entries
file, retained, dropped = $1, $2.to_f, $3.to_f
@full_cleanup.increment labels: {file: file}
@full_cleanup_retained.increment by: retained, labels: {file: file}
@full_cleanup_dropped.increment by: dropped, labels: {file: file}
else
STDERR.puts "# #{entry._systemd_unit} #{entry.syslog_identifier} cache: #{entry.message}"
end
end
end
class Postscreen
def initialize store, prometheus
@store = store
@noqueue = Noqueue.new store, prometheus.prefix_proxy( :noqueue)
@cache = Cache.new store, prometheus.prefix_proxy( :cache)
2022-05-23 14:17:11 +02:00
@connect_from = prometheus.counter :connect_from_total, docstring: 'A counter of connections to postscreen'
@whitelisted = prometheus.counter :whitelisted_total, docstring: 'A counter of WHITELISTED connections to postscreen'
@pass_old = prometheus.counter :pass_old_total, docstring: 'A counter of PASS OLD connections to postscreen'
@pass_new = prometheus.counter :pass_new_total, docstring: 'A counter of PASS NEW connections to postscreen'
@dnsbl = prometheus.counter :dnsbl_total, docstring: 'A counter of DNSBL-blocked to postscreen'
@bare_newline = prometheus.counter :bare_newline_total, docstring: 'A counter of BARE NEWLINE-blocked to postscreen'
@command_pipelining = prometheus.counter :command_pipelining_total, docstring: 'A counter of COMMAND PIPELINING-blocked to postscreen'
@command_time_limit = prometheus.counter :command_time_limit_total, docstring: 'A counter of COMMAND TIME LIMIT-blocked to postscreen'
@hangup = prometheus.counter :hangup_total, docstring: 'A counter of HANGUP to postscreen'
@bdat = prometheus.counter :bdat_total, docstring: 'A counter of BDAT to postscreen'
@pregreet = prometheus.counter :pregreet_total, docstring: 'A counter of PREGREET to postscreen'
@disconnect = prometheus.counter :disconnect_total, docstring: 'A counter of DISCONNECT to postscreen'
@unknown = prometheus.counter :unknown_total, docstring: 'A counter of unknown loglines by postscreen'
@warnings = prometheus.counter :warnings_total, docstring: 'A counter of any warnings'
@psc_cache_update_delay = prometheus.summary :psc_cache_update_delay_total, docstring: 'A counter of PSC cache update delays by file', labels: %i[file]
@curr_unavailable = prometheus.counter :service_currently_unavailable_total, docstring: 'A counter for rejected mails, because service currently unavailable - so greylisted.'
@dnsblog_reply_timeout = prometheus.counter :dnsblog_reply_timeout_total, docstring: 'Total timedout requests for dnsblog'
@data_without_valid_rcpt = prometheus.counter :data_without_valid_rcpt_total, docstring: 'A counter of DATA without valid RCPT events'
end
def collect entry
#STDERR.puts "postscreen: #{entry.message}"
case entry.message
2022-05-23 14:17:11 +02:00
when /\ACONNECT from / then @connect_from.increment
when /\AWHITELISTED / then @whitelisted.increment
when /\APASS OLD / then @pass_old.increment
when /\APASS NEW / then @pass_new.increment
when /\ADISCONNECT / then @disconnect.increment
when /\APREGREET / then @pregreet.increment
when /\ABDAT / then @bdat.increment
when /\AHANGUP / then @hangup.increment
when /\ADNSBL rank / then @dnsbl.increment
when /\ABARE NEWLINE / then @bare_newline.increment
when /\ACOMMAND PIPELINING / then @command_pipelining.increment
when /\ACOMMAND TIME LIMIT / then @command_time_limit.increment
when /\ADATA without valid RCPT / then @data_without_valid_rcpt.increment
when /\Acache / then @cache.collect entry
when /\ANOQUEUE: (.*)/
case msg = $1
when /\Areject: RCPT from [^ ]+: 450 4.3.2 Service currently unavailable; /
@curr_unavailable.increment
else
@noqueue.collect entry
end
when /\Awarning: (.*)/
@warnings.increment
case $1
when /\Apsc_cache_update: ([^ ]+) update average delay is ([^ ]+)/
@psc_cache_update_delay.observe $2.to_f, labels: {file: $1}
when /\Adnsblog reply timeout /
@dnsblog_reply_timeout.increment
when /\Agetpeername: Transport endpoint is not connected -- dropping this connection/
else
STDERR.puts "# postscreen warnings: #{entry.message}"
end
else
STDERR.puts "# postscreen: #{entry.message}"
@unknown.increment
end
end
end
def self.tls_posibilities
%w[Trusted Untrusted Anonymous].each do |trust|
%w[TLSv1.2 TLSv1.3].each do |tls|
%w[TLS_AES_128_GCM_SHA256 TLS_AES_256_GCM_SHA384 ECDHE-RSA-AES128-GCM-SHA256 ECDHE-RSA-AES256-GCM-SHA384].each do |cipher|
yield trust, tls, cipher
end
end
end
end
class Smtp
def initialize store, prometheus
@store = store
2022-05-23 14:17:11 +02:00
@connection_refused = prometheus.counter :connection_refused_total, docstring: 'A counter of refused connections on smtp'
@connection_timed_out = prometheus.counter :connection_timed_out_total, docstring: 'A counter of timed out connections on smtp'
2022-05-23 14:17:11 +02:00
@tls = prometheus.counter :tls_total, docstring: 'A counter of TLS connections on smtp with TLS-version and cipher', labels: %i[trust tls cipher]
Collector::Postfix.tls_posibilities {|t, s, c| @tls.increment by: 0, labels: {trust: t, tls: s, cipher: c} }
2022-05-23 14:17:11 +02:00
@status = prometheus.histogram :status, docstring: 'A histogram of message status by status', labels: %i[status]
@sent = prometheus.counter :sent_total, docstring: 'A counter of sent messages by smtp'
@deferred = prometheus.counter :deferred_total, docstring: 'A counter of deferred messages by smtp'
@bounced = prometheus.counter :bounced_total, docstring: 'A counter of bounced messages by smtp'
@deliverable = prometheus.counter :deliverable_total, docstring: 'A counter of deliverable messages by smtp'
@undeliverable = prometheus.counter :undeliverable_total, docstring: 'A counter of undeliverable messages by smtp'
@status_unknown = prometheus.counter :status_unknown_total, docstring: 'A counter of unknown status by smtp'
2022-05-23 14:17:11 +02:00
@unknown = prometheus.counter :unknown_total, docstring: 'A counter of unknown loglines by smtp'
end
def collect entry
#STDERR.puts "smtp: #{entry.message}"
case entry.message
when /\Aconnect to /
case entry.message
2022-05-23 14:17:11 +02:00
when / Connection refused\z/ then @connection_refused.increment
when / Connection timed out\z/ then @connection_timed_out.increment
else
STDERR.puts "# #{entry._systemd_unit} #{entry.syslog_identifier} connect to: #{entry.message}"
end
when /\A([^ ]+) TLS connection established to .* ([^ ]+) with cipher ([^ ]+)/
@tls.increment labels: {trust: $1, tls: $2, cipher: $3}
when /\A\w+: .* delay=([0-9.]+),.*status=([^ ]+)/
# postfix@-.service: postfix/smtp/smtp 4KZ0KY5Wx4z4Mn: to=<wegmann@psi.co.at>, relay=mail.psi.co.at[81.223.32.197]:25, delay=0.76, delays=0.2/0/0.35/0.21, dsn=2.0.0, status=sent (250 2.0.0 Ok: queued as 710F7A0263)
delay, status = $1.to_f, $2.downcase
@status.observe delay, labels: {status: status}
case status
2022-05-23 14:17:11 +02:00
when 'sent' then @sent.increment
when 'deferred' then @deferred.increment
when 'bounced' then @bounced.increment
when 'deliverable' then @deliverable.increment
when 'undeliverable' then @undeliverable.increment
else
STDERR.puts "# #{entry._systemd_unit} #{entry.syslog_identifier} status: #{entry.message}"
@status_unknown.increment
end
else
STDERR.puts "# #{entry._systemd_unit} #{entry.syslog_identifier}: #{entry.message}"
@unknown.increment
end
end
end
class Smtpd
def initialize store, prometheus
@store = store
@noqueue = Noqueue.new store, prometheus.prefix_proxy( :noqueue)
@connect_from = prometheus.counter :connect_from_total, docstring: 'A counter of connections to smtpd', labels: %i[from_unknown]
%w[0 1].each {|x| @connect_from.increment by: 0, labels: {from_unknown: x} }
@tls = prometheus.counter :tls_total, docstring: 'A counter of TLS connections to smtpd with TLS-version and cipher', labels: %i[trust tls cipher]
Collector::Postfix.tls_posibilities {|t, s, c| @tls.increment by: 0, labels: {trust: t, tls: s, cipher: c} }
@disconnect_from = prometheus.counter :disconnect_from_total, docstring: 'A counter of "disconnect from" to smtpd'
@disconnect_from_events = prometheus.counter :disconnect_from_event_total, docstring: 'A counter of events while connection-lifetime (auth, starttls, mail, rcpt, data, commands, ...) from disconnections', labels: %i[event]
%w[auth mail rcpt data commands starttls ehlo quit].each {|e| @disconnect_from_events.increment by: 0, labels: {event: e} }
@concurrenty_limit_exceeded = prometheus.counter :concurrenty_limit_exceeded_total, docstring: 'A counter of concurrenty limit exceeded connections to smtpd'
@timeout = prometheus.counter :timeout_connection_total, docstring: 'A counter of timedout connections to smtpd', labels: %i[after]
@lost_connection = prometheus.counter :lost_connection_total, docstring: 'A counter of lost connections to smtpd', labels: %i[after]
@accepted = prometheus.counter :accepted_total, docstring: 'A counter of accepted messages to smtpd'
@unknown = prometheus.counter :unknown_total, docstring: 'A counter of unknown loglines by smtpd'
@sasl_auth_failed = prometheus.counter :sasl_auth_failed_total, docstring: 'A counter of failed SASL authentication by method', labels: %i[method]
@non_smtp_command = prometheus.counter :non_smtp_command_total, docstring: 'A counter of Non-SMTP-commands (ex. was a HTTP GET / HTTP/1.1)'
@tls_lib_problem = prometheus.counter :tls_lib_problem_total, docstring: 'A counter of TLS libreary problems (ex. unsupported protocol'
@ssl_error = prometheus.counter :ssl_error_total, docstring: 'A counter of any SSL_accept errors by error', labels: %i[error]
@warnings = prometheus.counter :warnings_total, docstring: 'A counter of any warnings'
%w[].each {|m| @sasl_auth_failed.increment by: 0, labels: {method: m} }
@hostname_not_resolved_to_address = prometheus.counter :hostname_not_resolved_to_address_total, docstring: 'A counter of hostnames, which cannot be resolved to there IP'
end
def collect entry
#STDERR.puts "smtpd: #{entry.message}"
case entry.message
when /\Aconnect from unknown/
@connect_from.increment labels: {from_unknown: 1}
when /\Aconnect from /
@connect_from.increment labels: {from_unknown: 0}
when /\A([^ ]+) TLS connection established from .*: ([^ ]+) with cipher ([^ ]+) /
@tls.increment labels: {trust: $1, tls: $2, cipher: $3}
when /\ANOQUEUE: /
@noqueue.collect entry
when /\Adisconnect from ([^ ]+) (.*)/
@disconnect_from.increment
# ehlo=2 starttls=1 auth=1 mail=1 rcpt=1 data=1 commands=8
$2.split( ' ').each do |x|
case x
when /(\w+)=(\d+)/
@disconnect_from_events.increment by: $2.to_f, labels: {event: $1}
else
STDERR.puts "# #{entry._systemd_unit} #{entry.syslog_identifier} disconnect from: [#{x}] #{entry.message}"
end
end
when /\Awarning: (.*)/
@warnings.increment
case $1
when /\AConnection concurrency limit exceeded: /
@concurrenty_limit_exceeded.increment
when /\Ahostname ([^ ]+) does not resolve to address /
@hostname_not_resolved_to_address.increment
when /\A[^ ]+\[([^ ]+)\]: SASL ([^ ]+) authentication failed/
@sasl_auth_failed.increment labels: {method: $1.downcase}
when /\Anon-SMTP command from /
@non_smtp_command.increment
when /\ATLS library problem /
@tls_lib_problem.increment
else
STDERR.puts "# #{entry._systemd_unit} #{entry.syslog_identifier} wanring: #{entry.message}"
end
when /\ASSL_accept error from [^ ]+: (-?\d)/
@ssl_error.increment labels: {error: $1.to_i}
when /\Atimeout after ([^ ]+) from /
@timeout.increment labels: {after: $1}
when /\Alost connection after ([^ ]+) from /
@lost_connection.increment labels: {after: $1}
when /\A\w{8,15}: client=/ # sasl_method=
@accepted.increment
else
STDERR.puts "# #{entry._systemd_unit} #{entry.syslog_identifier}: #{entry.message}"
@unknown.increment
end
end
end
class Tlsproxy
def initialize store, prometheus
@store = store
2022-05-23 14:17:11 +02:00
@connect = prometheus.counter :connect_total, docstring: 'A counter of COUNNECT from tlsproxy'
@disconnect = prometheus.counter :disconnect_total, docstring: 'A counter of DISCOUNNECT from tlsproxy'
2022-05-23 14:17:11 +02:00
@tls = prometheus.counter :tls_total, docstring: 'A counter of TLS connections to smtpd with TLS-version and cipher', labels: %i[trust tls cipher]
Collector::Postfix.tls_posibilities {|t, s, c| @tls.increment by: 0, labels: {trust: t, tls: s, cipher: c} }
end
def collect entry
case entry.message
2022-05-23 14:17:11 +02:00
when /\ADISCONNECT / then @disconnect.increment
when /\ACONNECT / then @connect.increment
when /\A([^ ]+) TLS connection established from .*: ([^ ]+) with cipher ([^ ]+) /
@tls.increment labels: {trust: $1, tls: $2, cipher: $3}
else
STDERR.puts "# #{entry._systemd_unit} #{entry.syslog_identifier}: #{entry.message}"
end
end
end
class Bounce
def initialize store, prometheus
@store = store
2022-05-23 14:17:11 +02:00
@non_delivery = prometheus.counter :sender_non_delivery_notification, docstring: 'A counter of notifications to sender because mail cannot deliveried'
end
def collect entry
case entry.message
when /\A([^ ]+): sender non-delivery notification: (.+)/
# postfix@-.service: postfix/bounce/bounce 4L3cN05mX6zfB: sender non-delivery notification: 4L3cN06Qpkz4Fc
@non_delivery.increment
else
STDERR.puts "# #{entry._systemd_unit} #{entry.syslog_identifier}: #{entry.message}"
end
end
end
class SCache
def initialize store, prometheus
@store = store
2022-05-23 14:17:11 +02:00
@start = prometheus.gauge :start, docstring: 'Start timestamp of SCache statistics'
@domain_lookup_hits = prometheus.counter :domain_lookup_hits_total, docstring: 'Count of hits for domain lookups of SCache'
@domain_lookup_miss = prometheus.counter :domain_lookup_miss_total, docstring: 'Count of misses for domain lookups of SCache'
@domain_lookup_success_rate = prometheus.gauge :domain_lookup_success_rate, docstring: 'Success rate of domain lookups of SCache'
@max_simultaneaus_domains = prometheus.gauge :max_simultaneaus_domains, docstring: 'Max simultaneaus connections Domains of SCache'
@max_simultaneaus_addresses = prometheus.gauge :max_simultaneaus_addresses, docstring: 'Max simultaneaus connections Addresses of SCache'
@max_simultaneaus_connections = prometheus.gauge :max_simultaneaus_connections, docstring: 'Max simultaneaus connections of SCache'
end
def collect entry
case entry.message
when /\Astatistics: start interval (.*)/
# postfix@-.service: postfix/scache/scache statistics: start interval Apr 13 10:58:06
@start.set Time.parse( $1).to_f
when /\Astatistics: domain lookup hits=(\d+) miss=(\d+) success=(\d+)%/
# postfix@-.service: postfix/scache/scache statistics: domain lookup hits=2 miss=4 success=33%
@domain_lookup_hits.increment by: $1.to_f
@domain_lookup_miss.increment by: $2.to_f
@domain_lookup_success_rate.set $3.to_f/100
when /\Astatistics: max simultaneous domains=(\d+) addresses=(\d+) connection=(\d+)/
# postfix@-.service: postfix/scache/scache statistics: max simultaneous domains=1 addresses=1 connection=4
@max_simultaneaus_domains.set $1.to_f
@max_simultaneaus_addresses.set $2.to_f
@max_simultaneaus_connections.set $3.to_f
else
STDERR.puts "# #{entry._systemd_unit} #{entry.syslog_identifier}: #{entry.message}"
end
end
end
class Verify
def initialize store, prometheus
@store = store
@cache = Cache.new store, prometheus.prefix_proxy( :cache)
end
def collect entry
case entry.message
2022-05-23 14:17:11 +02:00
when /\Acache / then @cache.collect entry
else
STDERR.puts "# #{entry._systemd_unit} #{entry.syslog_identifier}: #{entry.message}"
end
end
end
def initialize store, prometheus
@store = store
@postscreen = Postscreen.new store, prometheus.prefix_proxy( :postscreen)
@smtp = Smtp.new store, prometheus.prefix_proxy( :smtp)
@smtpd = Smtpd.new store, prometheus.prefix_proxy( :smtpd)
@submission = Smtpd.new store, prometheus.prefix_proxy( :submission)
@tlsproxy = Tlsproxy.new store, prometheus.prefix_proxy( :tlsproxy)
@scache = SCache.new store, prometheus.prefix_proxy( :scache)
@bounce = Bounce.new store, prometheus.prefix_proxy( :bounce)
@verify = Verify.new store, prometheus.prefix_proxy( :verify)
2022-05-23 14:17:11 +02:00
@qmgr = prometheus.counter :qmgr_total, docstring: 'A counter of qmgr actions'
@cleanup = prometheus.counter :cleanup_total, docstring: 'A counter of cleanup actions'
@lmtp = prometheus.counter :lmtp_total, docstring: 'A counter of ltmp actions'
@pickup = prometheus.counter :pickup_total, docstring: 'A counter of pickup actions'
@dnsblog = prometheus.counter :dnsblog_total, docstring: 'A counter for DNS-Blacklisted IP by DNSBL', labels: %i[dnsbl]
@spf_fail = prometheus.counter :spf_fail_total, docstring: 'A counter of policyd-SPF failed SPF'
2022-05-23 14:17:11 +02:00
@spf = prometheus.counter :spf_total, docstring: 'A counter of prepended SPF header-lines with state', labels: %i[status]
%w[pass none].each {|s| @spf.increment by: 0, labels: {status: s} }
2022-05-23 14:17:11 +02:00
@anvil_max_connection_rate_per_minute = prometheus.gauge :anvil_max_connection_per_minute_rate, docstring: "Rate of max connections per minute to listener", labels: %i{listener}
@anvil_max_connection_count = prometheus.gauge :anvil_max_connection_count, docstring: "Count of max connections to listener", labels: %i{listener}
@anvil_max_cache_size = prometheus.gauge :anvil_max_cache_size, docstring: "Current max cache size"
end
def collect entry
#STDERR.puts "postfix: #{entry.syslog_identifier}: #{entry.message}"
case entry.syslog_identifier
2022-05-23 14:17:11 +02:00
when 'postfix/tlsproxy', 'postfix/tlsproxy/tlsproxy' then @tlsproxy.collect entry
when 'postfix/postscreen', 'postfix/smtp/postscreen' then @postscreen.collect entry
when 'postfix/smtp', 'postfix/smtp/smtp' then @smtp.collect entry
when 'postfix/smtpd', 'postfix/smtpd/smtpd' then @smtpd.collect entry
when 'postfix/submission/smtpd' then @submission.collect entry
when 'postfix/bounce', 'postfix/bounce/bounce' then @bounce.collect entry
when 'postifx/verify', 'postfix/verify/verify' then @verify.collect entry
when 'postfix/cleanup', 'postfix/cleanup/cleanup' then @cleanup.increment
when 'postfix/qmgr', 'postfix/qmgr/qmgr' then @qmgr.increment
when 'postfix/lmtp', 'postfix/lmtp/lmtp' then @lmtp.increment
when 'postfix/pickup', 'postfix/pickup/pickup' then @pickup.increment
when 'postfix/scache', 'postfix/scache/scache' then @scache.collect entry
when 'postfix/dnsblog', 'postfix/dnsblog/dnsblog'
case entry.message
when /\Aaddr [^ ]+ listed by domain ([^ ]+) as /
@dnsblog.increment labels: {dnsbl: $1.downcase}
else
STDERR.puts "# #{entry._systemd_unit} #{entry.syslog_identifier}: #{entry.message}"
end
when 'postfix/anvil', 'postfix/anvil/anvil'
case entry.message
when /\Astatistics: max connection rate (\d+)\/60s for \((.+)\) at (.*)/
@anvil_max_connection_rate_per_minute.set $1.to_i, labels: {listener: $2}
when /\Astatistics: max connection count (\d+) for \((.+)\) at (.*)/
@anvil_max_connection_count.set $1.to_i, labels: {listener: $2}
when /\Astatistics: max cache size (\d+) at (.*)/
@anvil_max_cache_size.set $1.to_i
else
STDERR.puts "# #{entry._systemd_unit} #{entry.syslog_identifier}: #{entry.message}"
end
2022-05-23 14:17:11 +02:00
when 'policyd-spf'
case entry.message
when /\Aprepend Received-SPF: (\w+) /
@spf.increment labels: {status: $1.downcase}
when /\A[^ ]+ [^ ]+ Message rejected due to: SPF fail /
# postfix@-.service policyd-spf: 550 5.7.23 Message rejected due to: SPF fail - not authorized.
@spf_fail.increment
else
STDERR.puts "# #{entry._systemd_unit} #{entry.syslog_identifier}: #{entry.message}"
end
else
STDERR.puts "# #{entry._systemd_unit}: #{entry.syslog_identifier} #{entry.message}"
end
end
end