File Manager
Current Path : /usr/share/crm114/ |
|
Current File : //usr/share/crm114/mailfilter.crm |
#! /usr/bin/crm
# -(learnspam learnnonspam learnfile stats_only config spamcss nonspamcss fileprefix force unlearn cache)
#
# mailfilter.crm - Statistical mail sorter
#
# Note to SunOS and FreeBSD users - do not place command arguments of
# "-([arguments])" format on the first line of this program
# or you will not get what you expect. This is due to a kernel
# difference in how a bangline should be dealt with.
# Copyright 2009 William S. Yerazunis.
# This file is under GPLv3, as described in COPYING.
# A statistical mail sorter with mail-to-yourself commanding
#
# --->>> Design Philosophy ( do these IN ORDER )
#
# * if --fileprefix is specified, all filenames EXCEPT --config
# are prefixed with that (You need a trailing slash on the prefix
# if it is a directory name.)
# * if --config , grab the config file from the specified place.
# * Load the mailfilter.cf config file from wherever config or
# fileprefix points (--config overrides --fileprefix).
# * If --spamcss is specified, use that as the spam.css file
# * If --nonspamcss is specified, use that as the nonspam.css file
# * If --learnspam, learn as spam and exit.
# * If --learnnonspam, learn as nonspam and exit
# * If --force, force-feed the learning
# * If --learnfile, use :learnfile:.css and :learnfile:text.txt
# * If --stats_only, do normal classification but don't do any
# forwarding, only output the status on stdout and return
# the exit code.
#
# * check for the "command word", if present, execute the command
#
# * check to see if any of the whitelist patterns apply. If so,
# accept the mail to /var/spool/the_user (which is actually to
# be found at /var/spool/mail/:*:_env_USER:
#
# * check to see if any of the blacklist patterns apply. If so,
# flush the mail to the "blacklisted" file.
#
# * check to see if it's commanded to be learned as a spam or a
# nonspam model. If so, learn it to the appropriate .css (Crm
# Sparse Spectra) file
#
# * run the email through the classifier. If the classifier thinks
# it's good, send it to the /var/spool/mail/the_user file, else
# send it to the "doubtful" file.
#
##############################################################
#
# --- uncomment this if you want to include a "forced"
# configuration file ---
# insert mailfilterconfig.crm
#
#
# --- These vars must have a value, or else we'll get errors ----
#
isolate (:classifier_reason:) /no reason yet/
#
isolate (:classify_status:) //
#
isolate (:our_exit_code:) /0/
#
isolate (:stats:) / pR: 0.000000 /
#
isolate (:pr:) / pR: 0.00000/
#
isolate (:subj_text:) / (None) /
#
isolate (:add_extra_stuff:) //
#
isolate (:decision_length:) /4096/
#
isolate (:cache:) // <default>
isolate (:cachedir:) //
isolate (:cacheid:) //
isolate (:msg_hash:) //
isolate <default> (:learnspam:)
isolate <default> (:learnnonspam:)
isolate <default> (:learnfile:)
isolate <default> (:stats_only:)
isolate <default> (:automatic_training:)
#
# Isolate these email addresses, and give them values,
# in case the user doesn't.
isolate (:reject_address:) //
isolate (:fail_priority_mail_to:) //
isolate (:fail_blacklist_mail_to:) //
isolate (:fail_SSM_mail_to:) //
isolate (:log_rejections:) //
#
# This ISOLATE will guarantee that :fileprefix: exists, and keep its
# prior (command-line) value if it has one, or an empty string if it doesn't.
isolate (:fileprefix:)
#
# This ISOLATE will guarantee that :force: will exist, and keep the
# commandline value ("SET") , or the null string if the user doesn't
# use --force on the command line.
isolate (:force:)
#
# This ISOLATE will guarantee that :unlearn: will exist, and will keep
# the commandline value ("SET") or the null string if the user doesn't
# use --unlearn on the command line.
isolate (:unlearn:)
#
# now, :clf: is the classify & learn flags; note that we have two
# separate flags here in a bizarre chain. The reason is that :unlearn:
# can have the value "SET", whereas :rft: needs "refute"
isolate (:clf:) //
#
# and someplace to catch mailtrainer if we need it.
isolate (:mailtrainer_output:) //
#
#####################################################################
#
# This is the code to read the per-user configuration. Note
# that because this happens during the run, it will _override_
# any command-line arguments that get set.
# Read the configuration file into :option_txt: and turn every line of
# the form   :name: /value/   into a variable :name: holding "value".
{
isolate (:option_txt:)
# NOTE(review): :ev: is not referenced anywhere else in the visible part
# of this file - confirm whether it is still needed.
isolate (:ev:)
isolate (:verbose_startup:)
isolate (:config:)
#
# Part 1 - read in the options/configuration file
#
{
{
# If :config: is non-empty (set via --config), read the options from
# that file. If this MATCH fails, the ALIUS branch below runs instead.
match [:config:] /.+/
input [:*:config:] (:option_txt:)
}
alius
{
# read in the standard mail filter configuration file.
input [:*:fileprefix:mailfilter.cf] (:option_txt:)
}
}
#
#
# reset loop for matching to start of :option_txt:
match [:option_txt:] //
# and loop till there are no more options.
{
# find a line that looks like a parameter setting...
# (<fromend> resumes the search after the previous match, so each pass
# of the LIAF loop picks up the next setting line; when no further line
# matches, the MATCH fails and the loop ends.)
match < fromend nomultiline > (:line: :name: :value:) \
[:option_txt:] /^[ ]*(:[[:graph:]]+:)[ \t]+\/(.*)\//
{
# don't execute the assign if there's
# a # (hex 23) at the start of the name.
# NOTE(review): the pattern above only captures names that begin
# with ':', so this guard looks redundant - confirm before removing.
match <absent> [:name:] /^\x23/
{
# Verbose startup?  If so, echo each name and value as it is set.
match [:verbose_startup:] /SET/
output / :*:name:\n :*:value:\n/
}
# Create (or overwrite) the variable whose name is held in :name:
# and give it the text captured between the slashes.
isolate (:*:name:) /:*:value:/
}
liaf
}
}
#
# Now, a tricky bit - we need to add "unlearn" to the :clf:
# if it was in the line params - but we have to append, not replace,
# because :clf: (CLassfier Flags) also contains the classifier we use.
#
{
match [:unlearn:] /SET/
alter (:clf:) /:*:clf: refute/
}
#
#
# Do a quick check- has the password been changed or not? If it's
# still the default, put in something that will be well-nigh unguessable
# (esp. since it will contain recieved headers that the sender cannot
# see nor control.)
{
match [:spw:] /DEFAULT_PASSWORD/
# yes, it's the same as default. So we scramble it just so
# nobody can hack in
hash (:spw:) /:*:_env_string::*:_dw:/
}
#############################################################
#
# Set up the addresses that we might need to mail to
#
# The rejection target starts out as the general failure address.
isolate (:reject_address:) /:*:general_fails_to:/
#
# Each per-category failure address that holds no printable character
# inherits the general failure address.  The three guards are
# independent of one another.
{
    match <absent> [:fail_SSM_mail_to:] /[[:graph:]]/
    alter (:fail_SSM_mail_to:) /:*:general_fails_to:/
}
{
    match <absent> [:fail_blacklist_mail_to:] /[[:graph:]]/
    alter (:fail_blacklist_mail_to:) /:*:general_fails_to:/
}
{
    match <absent> [:fail_priority_mail_to:] /[[:graph:]]/
    alter (:fail_priority_mail_to:) /:*:general_fails_to:/
}
###############################################################
# Does the user want us to log all incoming mail? This is handy for
# testing and auditing purposes.
{
match [:log_to_allmail.txt:] /yes/
output [:*:fileprefix:allmail.txt] <append> /:*:_dw:/
}
###############################################################
#
# Is text cacheing turned on?
# Text-cache setup: runs only when :text_cache: is non-empty.  Creates the
# cache directory tree if needed, builds :cacheid: for this message and,
# for ordinary (non-learn, non-stats) runs, stores the raw message.
{
match [:text_cache:] /./
{
# If the text_cache dir isn't there, create it
# and its subdirectories.
#
# The directory listing (or the ls error text) lands in :tmp:; the
# mkdirs below run only when "texts" does not appear in it.
isolate (:tmp:) //
syscall () (:tmp:) /ls :*:text_cache: 2>&1 /
match [:tmp:] <absent> /texts/
syscall () () /mkdir -p :*:text_cache: /
syscall () () /mkdir -p :*:text_cache:\/texts /
syscall () () /mkdir :*:text_cache:\/prob_good /
syscall () () /mkdir :*:text_cache:\/prob_spam /
syscall () () /mkdir :*:text_cache:\/known_good /
syscall () () /mkdir :*:text_cache:\/known_spam /
syscall () () /mkdir :*:text_cache:\/empty /
}
#
# Yes, text caching is on. Save the input as is in the cache.
#
# The cache ID is "sfid-" + the date(1) timestamp with its last four
# characters dropped + "_" + a hash of the whole message.
isolate (:system_time:) //
syscall () (:system_time:) /date +%Y%m%d_%H%M%S_%N /
match [:system_time:] (:: :cacheid:) /([[:graph:]]+)..../
hash (:msg_hash:) /:*:_dw:/
alter (:cacheid:) /sfid-:*:cacheid:_:*:msg_hash:/
# As long as this isn't a "learn" run, nor a "stats-only" run,
# we should save the text of this message in the text cache directory.
# Note to self: eventually this should also work with the
# command <password> spam nonspam stuff
{
# Any one of these three flags being SET fails the group and skips
# the write.
match <absent> [:stats_only:] /SET/
match <absent> [:learnspam:] /SET/
match <absent> [:learnnonspam:] /SET/
output [:*:text_cache:\/texts\/:*:cacheid:] /:*:_dw:/
}
}
##############################################################
#
# Grab the text that we're going to actually work with.
#
# We copy this into m_text - the "mutilated text". It
# will become an annotated _copy_ of the incoming text,
# with whatever changes we think will help us classify better.
#
# We clip m_text to be the first :decision_length: characters of
# the incoming mail.
#
match (:m_text:) [:_dw: 0 :*:decision_length:] /.*/
isolate (:m_text:)
#
# :b_text: is the text with base64's expanded.
isolate (:b_text:) /:*:m_text:/
#
# :i_text: is the text with Hypertextus Interruptus removed.
isolate (:i_text:) /:*:m_text:/
#
#
# To start with, the commanded text is assumed to be the entire input.
# THEN
# If there's a command followed by text, we save the text so we can
# put that, and _only_ that, into the .txt corpora.
{
isolate (:cmd_txt:) /:*:_dw:/
match (:: :cmd_txt:) [:_dw:] /command :*:spw: [^\n]*\n(.*)/
}
#
#
# do we do any expansions?
{
# expansion 1: - do we perform base64 expansions?
{
{
match [:do_base64:] /yes/
{
# yes, expand base64's if there are any
#
# Note: some spams don't even bother to use
# a 'Content-Transfer-Encoding' marker,
# and even fewer use Content-Type: text/whatever
# so we have to sort of wing it, when to expand
# what _might_ be base64 and when to ignore it.
# For now, if it says it's a base64, it gets
# expanded, no matter what the type. Maybe
# someday someone will put in a lockout for
# things like .jpg files, .doc files, etc.
#
isolate (:exp_text:)
match [:b_text:] <nocase> (:a: :h: :b:) \
/(Content-Transfer-Encoding): base64(.*)/
match (:c:) [:b:] \
/([a-zA-Z0-9+=!\/]+:*:_nl:){2,200}/
#
syscall (:*:c:) (:exp_text:) /:*:mime_decoder: /
# and stuff the result back into b_text for
# classification right in context.
alter (:c:) /:*:exp_text:/
# and mark this piece of mime as "prior".
alter (:h:) /Content-Transfer-Prior-Encoding/
# repeat till no more Mime base64 encodings
liaf
}
}
alius
{
# if no base64 expansions enabled, empty out :b_text:
#
alter (:b_text:) //
}
}
#
# If we had expansions, bust the html contents out of them, otherwise
# ignore b_text as it's redundant
{
{
match [:b_text:] <nocase> /Content-Transfer-Prior-Encoding/
alter (:i_text:) /:*:b_text:/
}
alius
{
# if :b_text: _didn't_ have a base64, it's useless
alter (:b_text:) //
}
}
# expansion 2 : do we bust HTML comments ( a.k.a.
# hypertextus interruptus) out?
{
match [:undo_interruptus:] /yes/
isolate (:commentbin:) //
{
match [:i_text:] (:comment:) /<!--([^-]|-[^-]|--[^>])*-->/
alter (:commentbin:) /:*:commentbin: :*:comment:/
alter (:comment:) //
liaf
}
# if we had at least 80 characters worth of comments, then
# it's worth using the decommented text, else not.
# (this my personal judgement call)
{
{
match [:commentbin:] /(.){80,}/
}
alius
{
alter (:i_text:) //
}
}
}
}
# and reassemble the mucked-over text into the :m_text: var, always
# with the base64's expanded, then a second decommented copy
#
{
isolate (:m_text:) \
/:*:m_text: :*:_nl: :*:b_text: :*:_nl: :*:i_text: :*:_nl:/
}
#########################################################
#
# Do we want to do any rewrites before running?
#
# Optional rewrite pass over :m_text:, driven by rewrites.mfp.  Each rule
# line is "pattern>-->replacement" (pattern may span lines) or
# "pattern>->replacement" (pattern matched with <nomultiline>).
{
match [:rewrites_enabled:] /yes/
isolate (:rewrites:)
input (:rewrites:) [:*:fileprefix:rewrites.mfp]
# reset matching on rewrites to start of string - if no string, no more
# processing of rewrites !!
match [:rewrites:] //
#
#
{
# Grab the next regex; turn the one-per-line patterns into a
# regex and a replacement string.
# First, do the line-spanning regexes.
# (:fr: is the pattern text before ">-->", :to: the text after it.)
match <fromend nomultiline> (:ch: :fr: :to:) [:rewrites:] /(.+)>-->(.*)/
# see if the "fr" regex matches anywhere
{
match [:m_text:] (:place:) /:*:fr:/
# Yep, it matched... alter it and do it again
#
# NOTE(review): if the replacement text itself still matches :fr:,
# this inner loop looks like it would never terminate - confirm
# that rule authors are warned about this.
alter (:place:) /:*:to:/
liaf
}
# Nope, didn't match... grab the next regex and try again,
liaf
}
#
# reset back to the start of the rewrites.
#
match [:rewrites:] //
#
# and do it again for non-line-spanners
{
# Go through and do it again, except this time do it for
# the non-line-spanning regexes (rules written with ">->").
match <fromend nomultiline> (:ch: :fr: :to:) [:rewrites:] /(.+)>->(.*)/
# see if the "fr" regex matches anywhere
{
match [:m_text:] <nomultiline> (:place:) /:*:fr:/
# Yep, it matched... alter it and do it again
#
alter (:place:) /:*:to:/
liaf
}
# Nope, didn't match... grab the next regex and try again,
liaf
}
} # done with rewrites.
###################################################################
#
# Command Dispatch processing starts here
#
# ---------do we have a --learnspam or --learnnonspam command line key?
#
match (:text:) [:m_text:] /.*/
isolate (:c:) //
isolate ( :spamcss: :nonspamcss: )
# Fall back to the stock statistics file names when --spamcss or
# --nonspamcss left the corresponding variable empty.
{
    match <absent> [:nonspamcss:] /./
    alter (:nonspamcss:) /nonspam.css/
}
{
    match <absent> [:spamcss:] /./
    alter (:spamcss:) /spam.css/
}
{
match [:learnspam:] /SET/
goto /:learnspamhere:/
}
{
match [:learnnonspam:] /SET/
goto /:learnnonspamhere:/
}
{
match (:trash: :file:) [:learnfile:] /(.+)/
goto /:learntofilehere:/
}
#
#
#------------ Are we enabled for "inoculations" via email?
#
# Inoculation handling: a mail carrying Inoculation-Sender / -Type /
# -Authentication headers is learned as spam or nonspam when its
# authentication value matches the md5sum of "<shared secret>\n<body>".
# Any MATCH that fails below abandons the whole group, and the mail is
# then processed normally.
{
match [:inoculations_enabled:] /yes/
#
# see if we have an inoculation header.
#
match <nomultiline> [:m_text:] \
/Inoculation-Sender: ([[:graph:]]+)/ (:x: :sender:)
match <nomultiline> [:m_text:] \
/Inoculation-Type: ([[:graph:]]+)/ (:x: :type:)
match <nomultiline> [:m_text:] \
/Inoculation-Authentication: (.*)$/ (:x: :auth:)
#
# See if the sender is in our list, and if so, what is their secret?
# (inoc_passwd.txt lines are expected to read "sender type secret".)
#
isolate (:inoculation_passwd:) //
input (:inoculation_passwd:) [:*:fileprefix:inoc_passwd.txt]
match [:inoculation_passwd:] <nomultiline> \
/:*:sender: :*:type: (.*)/ (:x: :secret:)
#
# We now have the shared secret, calculate the checksum we should have
#
# grab the body...
# NOTE(review): this MATCH names no input variable while the payload
# MATCH further down reads :m_text: - confirm that the difference
# between the hashed text and the learned text is intended.
match /\n\n(.*)/ (:x: :body:)
#
# and calculate the hash.
isolate (:md5out:)
syscall (:*:secret::*:_nl::*:body:) (:md5out:) /md5sum/
# keep only the first run of printable characters of the md5sum output
match [:md5out:] /([[:graph:]]+)/ (:x: :md5out:)
#
# does this hash match with the given hash?
match [:auth:] /:*:md5out:/
#
# Yes, it matched. It's a real inoculation.
#
# grab the text we want to actually learn (this is the payload)
match [:m_text:] (:x: :text:) /\n\n(.*)/
#
# and learn it appropriately
# ("nonspam" is tested first because /spam/ would also match "nonspam")
{
match [:type:] /nonspam/
goto /:learnnonspamhere:/
}
alius
{
match [:type:] /spam/
goto /:learnspamhere:/
}
}
#
# -------------check for the COMMAND WORD ----------
#
{
#
# grab the password as :pw:, and any arg(s) as :c:
#
match <nomultiline> (:z: :pw: :c: ) [:_dw:] /^command ([[:graph:]]+) (.*)/
#
# check the password. If it's invalid, FAIL out of this bracket set
# and just treat this as ordinary (non-command) mail.
match [:pw:] /:*:spw:/
{
# was it a command to add something to the whitelist?
match <nomultiline> (:q: :a:) [:c:] /whitelist (.*)/
output [:*:fileprefix:whitelist.mfp] <append> /:*:a::*:_nl:/
alter (:z:) /*** :*:z: *** :*:_nl:Whitelist command executed! :*:_nl:/
accept
exit /:*:accepted_mail_exit_code:/
}
{
# was it a command to add something to the blacklist?
match <nomultiline> (:q: :a:) [:c:] /blacklist (.*)/
output [:*:fileprefix:blacklist.mfp] <append> /:*:a::*:_nl:/
alter (:z:) /*** :*:z: *** :*:_nl:Blacklist command executed! :*:_nl:/
accept
exit /:*:accepted_mail_exit_code:/
}
#
# Did the user specify command "force"?
#
{
match <nomultiline> [:c:] /force/
#
# yep, so we set the "force" on.
alter (:force:) /SET/
}
#
# Did the user specify command "unlearn"?
#
{
match <nomultiline> [:c:] /unlearn/
#
# yep, so we set the "unlearn" flag on.
alter (:unlearn:) /SET/
}
#
# Now, if :unlearn: is set, by either bashline or command, we
# set the :clf: flag to be "refute". Otherwise, we set it to
# be what it was before.
#
{
match [:unlearn:] /SET/
alter (:clf:) /:*:clf: refute/
}
#
# Now, the big mahonka. Learn as nonspam, or as spam
# (note the three subpaths - one each for non-forced, forced, and
# non-forced error messages)
#
{
# was it a command to learn something as nonspam?
match [:c:] /nonspam/
match (:z: :text:) [:m_text:] /:*:_nl:command [[:graph:]]+ nonspam(.*)/
# and learn it as nonspam
:learnnonspamhere:
{
    # Cached-text training path for "learn as nonspam".
    #
    # If the command arguments mention "cache" (or --cache was given on the
    # command line), the message is not learned from the mail body.
    # Instead the original text is looked up in the text cache by its
    # X-CRM114-CacheID header and handed to mailtrainer.crm.
    # If :cache: is not SET this whole group fails and the native learning
    # group that follows is used.
    #
    # Are we supposed to use the cached version?
    {
        match [:c:] /cache/
        alter (:cache:) /SET/
    }
    match [:cache:] /SET/    # can also be set on command line
    # yes - so we use mailtrainer.crm to do the training
    {
        # SECURITY: the cache ID comes from a header of the incoming
        # (untrusted) mail and is interpolated into the shell commands
        # below (ls, ln, crm, rm -rf) and into a regex.  Accept only
        # letters, digits, '-' and '_': the IDs this filter generates are
        # "sfid-" + a date(1) timestamp + "_" + a hash value (expected to
        # be hexadecimal), so legitimate IDs are unaffected, while shell
        # and regex metacharacters can no longer get in.
        match (:: :fileid:) /X-CRM114-CacheID: ([-_a-zA-Z0-9]+)/
        # check- does the cached file exist?
        syscall () (:tmp:) /ls :*:text_cache:\/texts\/:*:fileid:/
        match [:tmp:] /:*:fileid:/
        # yes, it exists - go on with the learning method
        # and remember this file on a permanent basis
        syscall /ln :*:text_cache:\/texts\/:*:fileid: :*:text_cache:\/known_good\/:*:fileid: /
        # output / \n DOING: crm mailtrainer.crm --good=:*:text_cache:\/texts\/:*:fileid: --spam=:*:text_cache:\/empty\/ \n / () (:mailtrainer_output:)
        syscall /crm mailtrainer.crm --fileprefix :*:fileprefix: --good=:*:text_cache:\/texts\/:*:fileid: --spam=:*:text_cache:\/empty\/ / () (:mailtrainer_output:)
        # output /mailtrainer output: ':*:mailtrainer_output:'\n/
        # and remove it from the prob_* directories, as
        # now it's known
        syscall /rm -rf :*:text_cache:\/prob_spam\/:*:fileid:/
        syscall /rm -rf :*:text_cache:\/prob_good\/:*:fileid:/
        #
        # now it's trained; put in a marker in the headers
        call /:mungmail_delete:/ [X-CRM114-Status: ]
        call /:mungmail_delete:/ [X-CRM114-Unsure: ]
        call /:mungmail_add:/ [X-CRM114-Action: LEARNED AND CACHED NONSPAM ]
        # Insert the training report into the msgbody if desired
        {
            match [:add_mailtrainer_report:] /yes/
            match (:: :firstline:) /.*(.)/
            match (:: :firstline:) /\n\n()/
            alter (:firstline:) / :*:mailtrainer_output:\n-----\n/
        }
        accept
        exit /:*:accepted_mail_exit_code:/
    }
    # ALIUS added for symmetry with the "learn as spam" twin of this code.
    # Behaviour is unchanged, because the group above always exits when it
    # succeeds.
    alius
    {
        # No, it didn't exist. Add an error message header.
        call /:mungmail_add:/ [X-CRM114-ERROR: No cached text with that cacheID, so nothing done!]
        accept
        exit /:*:accepted_mail_exit_code:/
    }
}
{
    # No caching, so we learn "natively".
    #
    # Verify that we need to learn this first (TOE strategy): classify the
    # text, pull the pR of the first listed file (the nonspam statistics)
    # out of the status report, and go on only if it is below
    # :thick_threshold:.  If the EVAL fails, the ALIUS groups that follow
    # this one take over (forced learn, or "unnecessary").
    classify <:*:clf:> [:text:] /:*:lcr:/ \
        (:*:fileprefix::*:nonspamcss: :*:fileprefix::*:spamcss: ) \
        (:classify_status:)
    match [:classify_status:] <nomultiline> \
        /^#0.* pR: ([-. 0-9]+)/ (:: :pr:)
    eval /:@: :*:pr: < :*:thick_threshold: :/
    #
    # write out the pre-mutilation text, with newlines
    #
    output [:*:fileprefix:nonspamtext.txt] \
        <append> /\n\n:*:cmd_txt:\n/
    learn <:*:clf:> (:*:fileprefix::*:nonspamcss:) [:text:] /:*:lcr:/
    call /:mungmail_add:/ [X-CRM114-Action: LEARNED NONSPAM]
    call /:mungmail_unique:/ [X-CRM114-Status: Good (Learn)]
    # A second, unreachable ACCEPT/EXIT pair that used to follow these two
    # statements has been removed.
    accept
    exit /:*:accepted_mail_exit_code:/
}
alius
{
#
# Did the user specify "--force" on the command line?
match [:force:] /SET/
output [:*:fileprefix:nonspamtext.txt] <append> /\n\n:*:cmd_txt:\n/
#
# write out the pre-mutilation text, with newlines
#
learn < :*:clf: > (:*:fileprefix::*:nonspamcss:) [:text:] /:*:lcr:/
# syscall (:*:_dw:) (:_dw:) \
# /formail -A "X-CRM114-Action: LEARNED NONSPAM (FORCED) :*:clf:"/
call /:mungmail_add:/ [X-CRM114-Action: LEARNED NONSPAM (FORCED)]
call /:mungmail_unique:/ [X-CRM114-Status: Good (Learn)]
accept
exit /:*:accepted_mail_exit_code:/
}
alius
{
call /:mungmail_add:/ [X-CRM114-Action: LEARN AS NONSPAM UNNECESSARY- ALREADY CLASSIFIED CORRECTLY - NO ACTION TAKEN]
accept
exit /:*:accepted_mail_exit_code:/
}
}
{
# was it a command to learn something as spam?
match [:c:] /spam/
match (:z: :text:) [:m_text:] /:*:_nl:command [[:graph:]]+ spam(.*)/
# and learn it as spam
:learnspamhere:
{
    # Cached-text training path for "learn as spam".
    #
    # If the command arguments mention "cache" (or --cache was given on the
    # command line), the original text is looked up in the text cache by
    # its X-CRM114-CacheID header and handed to mailtrainer.crm.
    # If :cache: is not SET this whole group fails and the native learning
    # group that follows is used.
    #
    # Are we supposed to use the cached version?
    {
        match [:c:] /cache/
        alter (:cache:) /SET/
    }
    match [:cache:] /SET/    # can also be set on command line
    # check- does the cached file exist?
    {
        # yes - so we use mailtrainer.crm to do the training
        #
        # SECURITY: the cache ID comes from a header of the incoming
        # (untrusted) mail and is interpolated into the shell commands
        # below (ls, ln, crm, rm -rf) and into a regex.  Accept only
        # letters, digits, '-' and '_': the IDs this filter generates are
        # "sfid-" + a date(1) timestamp + "_" + a hash value (expected to
        # be hexadecimal), so legitimate IDs are unaffected, while shell
        # and regex metacharacters can no longer get in.
        match (:: :fileid:) /X-CRM114-CacheID: ([-_a-zA-Z0-9]+)/
        syscall () (:tmp:) /ls :*:text_cache:\/texts\/:*:fileid:/
        match [:tmp:] /:*:fileid:/
        # remember this file on a permanent basis
        syscall /ln :*:text_cache:\/texts\/:*:fileid: :*:text_cache:\/known_spam\/:*:fileid: /
        #output /DOING: crm mailtrainer.crm --spam=:*:text_cache:\/known_spam\/:*:fileid: --good=:*:text_cache:\/empty\/ / () (:mailtrainer_output:)
        syscall /crm mailtrainer.crm --fileprefix :*:fileprefix: --spam=:*:text_cache:\/known_spam\/:*:fileid: --good=:*:text_cache:\/empty\/ / () (:mailtrainer_output:)
        #output /mailtrainer output: ':*:mailtrainer_output:'\n/
        # and remove it from the prob_* directories, as
        # now it's known
        syscall /rm -rf :*:text_cache:\/prob_spam\/:*:fileid:/
        syscall /rm -rf :*:text_cache:\/prob_good\/:*:fileid:/
        #
        # now it's trained; put in a marker in the headers
        call /:mungmail_delete:/ [X-CRM114-Status: ]
        call /:mungmail_delete:/ [X-CRM114-Unsure: ]
        call /:mungmail_add:/ [X-CRM114-Action: LEARNED AND CACHED SPAM]
        # Insert the training report in the msgbody, if desired
        {
            match [:add_mailtrainer_report:] /yes/
            match (:: :firstline:) /.*(.)/
            match (:: :firstline:) /\n\n()/
            alter (:firstline:) / :*:mailtrainer_output:\n ------ \n/
        }
        accept
        exit /:*:accepted_mail_exit_code:/
    }
    alius
    {
        # No usable cache ID, or no such cached file: report it in a header.
        call /:mungmail_add:/ [X-CRM114-ERROR: No cached text with that cacheID, so nothing done! ]
        accept
        exit /:*:accepted_mail_exit_code:/
    }
}
{
# Not cached...
#
# Verify that we need to learn this first (TOE strategy)
classify <:*:clf:> [:text:] /:*:lcr:/ \
(:*:fileprefix::*:spamcss: :*:fileprefix::*:nonspamcss: )\
(:classify_status:)
match [:classify_status:] <nomultiline> \
/^#0.* pR: ([-. 0-9]+)/ (:: :pr:)
eval /:@: :*:pr: < :*:thick_threshold: : /
#
# write out the pre-mutilation text, with newlines
#
output [:*:fileprefix:spamtext.txt] <append> /\n\n:*:cmd_txt: \n/
learn < :*:clf:> (:*:fileprefix::*:spamcss:) [:text:] /:*:lcr:/
call /:mungmail_add:/ [X-CRM114-Action: LEARNED SPAM]
call /:mungmail_unique:/ [X-CRM114-Status: Good (Spam Learn)]
accept
exit /:*:accepted_mail_exit_code:/
}
alius
{
# Did the user specify "--force" on the command line?
match [:force:] /SET/
#
# write out the pre-mutilation text, with newlines
#
output [:*:fileprefix:spamtext.txt] <append> /\n\n:*:cmd_txt: \n/
learn < :*:clf:> (:*:fileprefix::*:spamcss:) [:text:] /:*:lcr:/
call /:mungmail_add:/ [X-CRM114-Action: LEARNED SPAM (FORCED)]
call /:mungmail_unique:/ [X-CRM114-Status: Good (Spam Learn)]
accept
exit /:*:accepted_mail_exit_code:/
}
alius
{
call /:mungmail_add:/ [X-CRM114-Action: LEARN AS SPAM UNNECESSARY- ALREADY CLASSIFIED CORRECTLY - NO ACTION TAKEN]
call /:mungmail_unique:/ [X-CRM114-Status: Good (Spam Learn)]
accept
exit /:*:accepted_mail_exit_code:/
}
}
{
    # was it a command to learn something as an arbitrary type?
    # Note: the files this generates don't get used for anything unless
    # you use --spamcss and --nonspamcss in your own scripts.
    #
    # Note: these "learns" are a-priori "force", since we don't know
    # what other .css files we should compare this text to.
    #
    # This group is entered in one of two ways:
    #   * by mail command  "command <password> learn <name> ...", where the
    #     MATCH below binds :learnfile: and :text:
    #   * by GOTO to :learntofilehere: when --learnfile was given on the
    #     command line, in which case :learnfile: already holds the name.
    match [:c:] /learn/
    match (:z: :learnfile: :text:) [:m_text:] /:*:_nl:command [[:graph:]]+ learn ([[:graph:]]+)(.*)/
    # and learn it
    :learntofilehere:
    output [:*:fileprefix::*:learnfile:text.txt] <append> /:*:text:/
    learn < :*:clf:> (:*:fileprefix::*:learnfile:.css) [:text:] /:*:lcr:/
    # Report the class name via :learnfile:.  The header used to expand
    # :file:, which is bound only on the command-line path, so a learn
    # requested by mail command produced a header without the class name.
    # On the command-line path :file: and :learnfile: hold the same text.
    call /:mungmail_add:/ [X-CRM114-Action: LEARNED :*:learnfile:]
    call /:mungmail_unique:/ [X-CRM114-Status: Good (Learn)]
    accept
    exit /:*:accepted_mail_exit_code:/
}
}
#
#
# George's Circuit Breaker - if the mail has already been processed
# by CRM114, then send it directly to output, without further
# processing.
#
# WE DON'T RISK THIS ANY MORE - WITH ~ A MILLION USERS, WE'RE NOW
# A TARGET FOR SPAMMERS TO USE THIS HACK.
#{
# match /X-CRM114/
# alter (:classifier_reason:) \
# / This mail seems to have already been processed by CRM114. /
# alter (:stats:) / pR: 999.99 /
# goto /:looks_good:/
#}
# none of the above - classify this incoming mail instead.
# first according to priority action list,
# then according to whitelist,
# then according to blacklist,
# then according to the CRM sparse spectral classifier.
#
# check it against the priority action list- this list is
# of the form of a + or -, then a pattern. + means accept,
# - means reject. These are executed in order (which is
# different from whitelist or blacklist in that they occur
# in order given, not whitelist-then-blacklist. The priority
# action list is tried before whitelist or blacklist.
#
isolate (:priolist:)
input (:priolist:) [:*:fileprefix:priolist.mfp]
# reset matching on :priolist: to the start of the string
match [:priolist:] //
#
#
# Walk priolist.mfp one line at a time.  The first character of a line
# is the action (+ accept, - reject), the rest is a regex tried against
# the incoming mail.  The first line whose regex matches decides.
{
#... Grab the next regex; turn the one-per-line patterns into a regex
#    (:pm: is the leading +/- character, :pat: the pattern after it)
match <fromend nomultiline> (:w: :pm: :pat:) [:priolist:] /(.)(.+)/
#... see if this regex matches the incoming mail
{
match <nomultiline> (:reason:) /:*:pat:/
# Yep, it matched... branch based on pm
#
{
match [:pm:] /[+]/
# put in a little tag saying why prio-listed
alter (:classifier_reason:) /** ACCEPT: CRM114 Priority Whitelisted by: :*:reason: **:*:_nl:/
alter (:stats:) / pR: 999.99 /
goto /:looks_good:/
}
# No, we didn't have a +, was it a '-'?
{
match [:pm:] /[-]/
alter (:classifier_reason:) /** REJECT: CRM114 Priority Blacklisted by: :*:reason: **:*:_nl:/
alter (:reject_address:) /:*:fail_priority_mail_to:/
{
# Priority rejections are logged to the same file as ordinary
# blacklist rejections.
match [:log_rejections:] /yes/
output [:*:fileprefix:rejected_by_blacklist.txt] <append> /:*:_dw:/
}
alter (:stats:) / pR: -999.99 /
goto /:looks_bad:/
}
}
# Nope, didn't match as a priority... grab the next regex
liaf
}
#
#
# check it against the whitelist... load the whitelist...
{
isolate (:whitelist:)
input (:whitelist:) [:*:fileprefix:whitelist.mfp]
# reset matching on whitelist to start of string
match [:whitelist:] //
}
#
#
# Walk the whitelist one line at a time; each line is a regex tried
# against the incoming mail.  The first hit accepts the mail.
{
#... Grab the next regex; turn the one-per-line patterns into a regex
match <fromend nomultiline> (:waste: :whregex:) [:whitelist:] /(.+)/
#... see if this regex matches the incoming mail
{
match <nomultiline> (:reason:) /:*:whregex:/
# Yep, it matched... whitelist this email
#
# put in a little tag saying why whitelisted:
alter (:classifier_reason:) /** ACCEPT: CRM114 Whitelisted by: :*:reason: **:*:_nl:/
# and append the same note to the end of the message itself
alter (:_dw:) /:*:_dw:\n\n ** CRM114 Whitelisted by: :*:reason: **:*:_nl:/
alter (:stats:) / pR: 999.99 /
goto /:looks_good:/
}
# Nope, didn't match... grab the next regex and try again,
liaf
}
#
# No joy, maybe we should blacklist it.
#
# check it against the blacklist
{
isolate (:blacklist:)
input (:blacklist:) [:*:fileprefix:blacklist.mfp]
# reset matching on blacklist to start of string
match [:blacklist:] //
}
#
# Walk the blacklist one line at a time; each line is a regex tried
# against the incoming mail.  The first hit rejects the mail.
{
#... Grab the next regex; turn the one-per-line patterns into a regex
match <fromend nomultiline> (:waste: :blregex:) [:blacklist:] /(.+)/
#... see if this regex matches the incoming mail
{
match <nomultiline> (:reason:) /:*:blregex:/
# Yep, it matched... blacklist this email
#
# put in a little tag saying why blacklisted
alter (:classifier_reason:) /** REJECT: CRM114 Blacklisted by: :*:reason: ** :*:_nl:/
# rejected mail, if forwarded, goes to the blacklist failure address
alter (:reject_address:) /:*:fail_blacklist_mail_to:/
{
match [:log_rejections:] /yes/
output [:*:fileprefix:rejected_by_blacklist.txt] <append> /:*:_dw:/
}
alter (:stats:) / pR: -999.99 /
goto /:looks_bad:/
}
# Nope, didn't match... grab the next regex and try again
liaf
}
#
#
#
# End of blacklist processing.
#
#
# All else has failed- we now run our CLASSIFY algorithm
# to make our best guess.
#
#
# Last resort: statistical classification of :m_text: against the nonspam
# and spam statistics files.  The result report goes into :stats:.
{
# Run the CSS classifier against the "expanded" text -
# if it classifies as SPAM
# then reject it as SPAM.
#
# The CLASSIFY sits in its own brace group so that a failure of the
# statement ends only this inner group; the decision proper is taken
# from the pR value below.
{
classify <:*:clf:> ( :*:fileprefix::*:nonspamcss: | :*:fileprefix::*:spamcss: ) ( :stats: ) [:m_text:] /:*:lcr:/
}
# Now we grab the pR and if it's greater than the minus
# threshold, we send it to "good". Otherwise, it goes to bad.
{
match <nomultiline> [:stats:] (:d: :pval:) /pR: (.*)/
eval /:@: :*:pval: > ( 0.0 - :*:thick_threshold: ) : /
alter (:classifier_reason:) /** ACCEPT: CRM114 PASS :*:clf: Matcher ** :*:_nl::*:stats:/
goto /:looks_good:/
}
# Reached only when the MATCH or EVAL above failed: treat as spam.
alter (:classifier_reason:) /** REJECT: CRM114 FAIL :*:clf: Matcher ** :*:_nl::*:stats:/
alter (:reject_address:) /:*:fail_SSM_mail_to:/
{
match [:log_rejections:] /yes/
output [:*:fileprefix:rejected_by_css.txt] <append> /:*:_dw:/
}
goto /:looks_bad:/
}
#
#
# Final wrap-up routines - dispose of the mail as appropriate.
#
{
:looks_bad:
# is this a :stats_only: run (i.e. for CAMRAM)
{
match [:stats_only:] /SET/
match <nomultiline> [:stats:] (:d: :pval:) /pR: (.*)/
output /:*:pval: :*:_nl:/
alter (:our_exit_code:) /:*:rejected_mail_exit_code:/
goto /:finish_up:/
}
# not stats_only.... we're doing major output.
# save unprocessed text by hard link (plain ln, no -s) in the text cache if needed.
{
match [:text_cache:] /./
syscall () () /ln :*:text_cache:\/texts\/:*:cacheid: :*:text_cache:\/prob_spam\/:*:cacheid:/
}
# and write out the long-form message too.
{
{
match [:add_headers:] /yes/
{
match <nomultiline> [:stats:] (:pr:) /pR: .*$/
}
call /:mungmail_add:/ [X-CRM114-Version: :*:_crm_version: MF-:*:_pgm_hash: ]
call /:mungmail_unique:/ [X-CRM114-CacheID: :*:cacheid: ]
call /:mungmail_unique:/ [X-CRM114-Status: SPAM ( :*:pr: )]
}
#
# Now, get the Subject: line. If none, make one.
{
{
match (:subject_line: :subj_text:) <nocase nomultiline> \
/^Subject: (.*)/
}
alius
{
match (:end_of_headers:) /\n\n/
alter (:end_of_headers:) /\nSubject: (none)\n\n/
match (:subject_line: :subj_text:) <nomultiline> /^Subject: (.*)/ }
}
{
#
# If we are re-sending this, we want to de-fang the
# subject, otherwise we don't.
match [:reject_address:] /[a-zA-Z0-9]/
# Paolo P. suggests this alteration to avoid subversion
# by enclosing an alternate target in "marks". We always
# have to do this.
{
match (:dq:) [:subj_text:] /\$/
alter (:dq:) /USD/
liaf
}
{
match (:dq:) [:subj_text:] /[^-a-zA-Z0-9!., ]/
alter (:dq:) //
liaf
}
#
# We isolate subj_text here, so if later syscalls move
# things, the subject text used in "mail" is still OK.
isolate (:subj_text:)
}
#
# If the user asked for a spam-flagging string, put the flagging
# string into the subject.
#
{
match [:spam_flag_subject_string:] /./
alter (:subj_text:) \
/:*:spam_flag_subject_string: :*:subj_text:/
}
{
match [:add_extra_stuff:] /text/
# get rid of any first-column 'From's as they are message breaks!
# this isn't necessary if we're mailing to someplace else...
{
match (:f:) <nomultiline> [:m_text:] /^From/
alter (:f:) />:*:f:/
liaf
}
alter (:_dw:) /:*:_dw:-=-Extra Stuff-=-\n\n:*:m_text: -0-0-0- :*:_nl:/
}
{
    # "attachment" flavour of :add_extra_stuff: for rejected mail: turn the
    # message into multipart/mixed and append the mutilated text :m_text:
    # as a trailing text/plain part.
    match [:add_extra_stuff:] /attachment/
    # get rid of any first-column 'From's as they are message breaks!
    # this isn't necessary if we're mailing to someplace else...
    {
        match (:f:) <nomultiline> [:m_text:] /^From/
        alter (:f:) / :*:f:/
        liaf
    }
    # Pull out the original Content-Type and Content-Transfer-Encoding
    # headers; they are re-emitted below as the headers of the first part.
    isolate (:content_type:) //
    call /:mungmail_extract:/ [Content-type] (:content_type:)
    isolate (:content_transfer_encoding:) //
    call /:mungmail_extract:/ [Content-Transfer-Encoding] (:content_transfer_encoding:)
    # The header text used to begin with a stray double quote left over
    # from the old  formail -A "..."  command line, so the header was
    # emitted as  "Content-Type: ...  and was not a valid Content-Type
    # header.  The quote has been removed.
    # NOTE(review): the old formail version also added "MIME-Version: 1.0";
    # nothing adds that header here - confirm whether it is needed.
    call /:mungmail_add:/ [Content-Type: multipart\/mixed\; boundary=Attachment_Quote_Boundary_1234567890\n--Attachment_Quote_Boundary_1234567890\n:*:content_type::*:content_transfer_encoding:]
    # The continuation lines below are part of the pattern text, so they
    # must stay flush left - indenting them would insert spaces into the
    # generated message.
    alter (:_dw:) /:*:_dw::*:_nl:\
--Attachment_Quote_Boundary_1234567890 :*:_nl:\
Content-Type: text\/plain :*:_nl:\
Content-Transfer-Encoding: quoted-printable \n\n\n:*:m_text:\
\n--Attachment_Quote_Boundary_1234567890--\n/
}
#
#
# Decide if we forward or if we just output it.
{
{
# if this match succeeds, we should forward-to-an-address?
# Yes, but only if we _have_ a forward-to address.
match [:reject_address:] /[a-zA-Z0-9]/
{
# -- put the classifier reason in as the first thing!
match [:add_verbose_stats:] /yes/
alter (:_dw:) /:*:_nl: :*:classifier_reason::*:_nl: :*:_dw: /
}
syscall (:*:_dw:) /mail :*:reject_address: -s ':*:subj_text:'/
}
alius
{
{
# -- put the classifier reason in at the end of the headers
match [:add_verbose_stats:] /yes/
match (:start_of_data:) /\n\n/
alter (:start_of_data:) /\n\n :*:classifier_reason: \n /
}
accept
}
}
}
alter (:our_exit_code:) /:*:rejected_mail_exit_code:/
goto /:finish_up:/
}
#
# and here's where we accept something as good email.
{
    :looks_good:
    #
    #   Accept path: the message is being treated as good mail.
    #   Reads :stats:, :stats_only:, :text_cache:, :cacheid:,
    #   :add_verbose_stats:, :add_headers:, :add_extra_stuff:, :m_text:,
    #   :classifier_reason: and :thick_threshold: (all set outside this
    #   block), rewrites the data window :_dw:, sets :our_exit_code: and
    #   always leaves via :finish_up:.
    #
    #   Is this a :stats_only: run (i.e. for CAMRAM)?  If so, print only
    #   the pR value taken from :stats: and skip all mail output.
    {
        match [:stats_only:] /SET/
        match <nomultiline> [:stats:] (:d: :pval:) /pR: (.*)/
        output /:*:pval: :*:_nl:/
        alter (:our_exit_code:) /:*:accepted_mail_exit_code:/
        goto /:finish_up:/
    }
    #   Not stats-only; do the full output thing.
    #   Save the unprocessed text by link in the text cache if needed
    #   (only when :text_cache: is non-empty).
    {
        match [:text_cache:] /./
        syscall () () /ln :*:text_cache:\/texts\/:*:cacheid: :*:text_cache:\/prob_good\/:*:cacheid:/
    }
    #   and generate up a pretty mail-out report.
    {
        match [:add_verbose_stats:] /yes/
        alter (:_dw:) /:*:_dw: :*:_nl: :*:classifier_reason: :*:_nl:/
    }
    {
        match [:add_headers:] /yes/
        {
            #   :pr: is the "pR: ..." text up to the end of that line.
            match <nomultiline> [:stats:] (:pr:) /pR: .*$/
        }
        # syscall (:*:_dw:) (:_dw:) /formail -A "X-CRM114-Version: :*:_crm_version: MF-:*:_pgm_hash: " -A "X-CRM114-Status: Good ( :*:pr: \)"/
        call /:mungmail_add:/ [X-CRM114-Version: :*:_crm_version: MF-:*:_pgm_hash: [:*:pr:]]
        call /:mungmail_unique:/ [X-CRM114-CacheID: :*:cacheid: ]
        call /:mungmail_unique:/ [X-CRM114-Status: Good ( :*:pr: )]
        {
            #   Maybe we need to tag it as unsure?  Note that since mail
            #   that scores > -thresh (but still < 0) goes out the "good"
            #   pipe, some "spammy" email might come through here.
            #   If the eval fails the block exits and the "Good" status
            #   header set above is kept.
            match <nomultiline> [:stats:] (:d: :pval:) /pR: (.*)/
            eval /:@: :*:pval: < :*:thick_threshold: :/
            call /:mungmail_unique:/ [X-CRM114-Status: UNSURE (:*:pval:) This message is 'unsure'; please train it! ]
        }
    }
    {
        match [:add_extra_stuff:] /text/
        #   get rid of any first-column 'From's as they are message breaks!
        #   this isn't necessary if we're mailing to someplace else...
        {
            match (:f:) <nomultiline> [:m_text:] /^From/
            alter (:f:) / :*:f:/
            liaf
        }
        alter (:_dw:) /:*:_dw:-=-Extra Stuff-=-\n\n :*:m_text: -0-0-0- \n/
    }
    {
        match [:add_extra_stuff:] /attachment/
        #   get rid of any first-column 'From's as they are message breaks!
        #   this isn't necessary if we're mailing to someplace else...
        {
            match (:f:) <nomultiline> [:m_text:] /^From/
            alter (:f:) / :*:f:/
            liaf
        }
        isolate (:content_type:) //
        # syscall (:*:_dw:) (:content_type:) /formail -X "Content-Type"/
        call /:mungmail_extract:/ (:content_type:) [Content-Type]
        isolate (:content_transfer_encoding:) //
        # syscall (:*:_dw:) (:content_transfer_encoding:) /formail -X "Content-Transfer-Encoding"/
        #   The header name is passed WITHOUT a trailing colon:
        #   :mungmail_extract: adds the colon itself, so a name that
        #   already ends in ':' can never match.
        call /:mungmail_extract:/ (:content_transfer_encoding:) [Content-Transfer-Encoding]
        # syscall (:*:_dw:) (:_dw:) /formail -A "MIME-Version: 1.0"/
        call /:mungmail_add:/ [MIME-Version: 1.0]
        # syscall (:*:_dw:) (:_dw:) /formail -A "Content-Type: multipart\/mixed\; boundary=Attachment_Quote_Boundary_1234567890\n--Attachment_Quote_Boundary_1234567890\n:*:content_type::*:content_transfer_encoding:"/
        #   NOTE(review): the formail line above (adding the
        #   multipart Content-Type) has no mungmail replacement in this
        #   path; only the "unique" step below was ported.  Confirm
        #   whether that is intended.
        # syscall (:*:_dw:) (:_dw:) /formail -U "Content-Type"/
        call /:mungmail_unique:/ [Content-Type:]
        #   The continuation lines below stay in column 0 on purpose:
        #   they are inside the /.../ string, so any leading blanks
        #   would become part of the message text.
        alter (:_dw:) /:*:_dw::*:_nl:\
--Attachment_Quote_Boundary_1234567890 :*:_nl:\
Content-Type: text\/plain :*:_nl:\
Content-Transfer-Encoding: quoted-printable \n\n\n:*:m_text:\
\n--Attachment_Quote_Boundary_1234567890--\n/
    }
    accept
    alter (:our_exit_code:) /:*:accepted_mail_exit_code:/
    goto /:finish_up:/
}
#
# Here's where we finish up processing in all the paths.
#
:finish_up:
{
    #   Common tail for every path: optionally auto-train on this
    #   message, then fall through to :exit_here:.  Any failing match
    #   below simply leaves this block, i.e. no auto-training happens.
    #
    #   ---- should we consider automatic training?
    match [:automatic_training:] /yes/
    #   bounce out if we've already auto-trained this email.
    #   Both spellings are checked: the fallback branch below tags the
    #   Subject with "AUTO_TRAINED" (underscore), while the (currently
    #   disabled) mail command uses "AUTO-TRAINED" (hyphen).  Checking
    #   only the hyphen form never recognised a tagged Subject.
    match <absent> /AUTO[-_]TRAINED/
    isolate (:msghash:)
    hash (:msghash:) /:*:_dw:/
    #   pick one in 16 here - only continue if the hash has a 0 in its
    #   seventh position (second-to-last digit, per the original note)
    match [:msghash:] /......0./
    #
    #   output autotraining...
    #   Yep... we should use this for autotraining
    #   do we auto-train on acceptance?
    {
        match [:classifier_reason:] /ACCEPT/
        #   it wasn't spam... autotrain it "nonspam":
        #   append :text: to the nonspam text file, then learn :m_text:
        output [:*:fileprefix:nonspamtext.txt] <append> /:*:text:/
        learn <microgroom> (:*:fileprefix::*:nonspamcss:) [:m_text:] /:*:lcr:/
        goto /:autotrain_finish:/
    }
    #   or do we autotrain on rejection?
    {
        match [:classifier_reason:] /REJECT/
        #   it was spam... autotrain it "spam"
        output [:*:fileprefix:spamtext.txt] <append> /:*:text:/
        learn <microgroom> (:*:fileprefix::*:spamcss:) [:m_text:] /:*:lcr:/
        goto /:autotrain_finish:/
    }
    :autotrain_finish:
    {
        {
            #   An autotrain address is configured (two or more
            #   characters).  The notification mail is disabled, so
            #   this branch currently does nothing except suppress the
            #   alius branch below.
            match [:autotrain_address:] /../
            # syscall (:*:classifier_reason: :*:_nl: :*:_dw:) /mail -s "AUTO-TRAINED email - please check" :*:autotrain_address:/
        }
        alius
        {
            #   there was no autotrain address, so we just accept it,
            #   after tagging the Subject line.
            match (:subj:) /Subject:/
            alter (:subj:) /Subject: AUTO_TRAINED- please check! .../
            accept
        }
    }
}
:exit_here:
exit /:*:our_exit_code:/
###################################################
#
#
# This is Mungmail - these are the replacement routines for
# formail(), to remove dependency on formail() being in every
# distribution
#
#
# Add a new header
#
# :mungmail_add: (:new_header:)
# Appends the text of :new_header: plus a newline to the end of the
# current header region.  :current_headers: is (re)bound by
# :mungmail_grab_current_headers:, so the alter below edits the
# message in place.  Existing headers of the same type are left
# untouched.  Returns nothing.
:mungmail_add: (:new_header:)
{
# Grab the current headers
call /:mungmail_grab_current_headers:/
# Rewrite the header region as "old headers + new header + newline".
alter (:current_headers:) /:*:current_headers::*:new_header:\n/
return
}
#
# extract a header (first of them found)
#
# :mungmail_extract: (:header_name:)
#   :header_name: - field name to look for, WITHOUT the trailing colon
#                   (e.g. Content-Type); matched case-insensitively.
#   Returns the whole header ("Name: value", including any continuation
#   lines that start with whitespace, without the final newline), or an
#   empty string when no such header is found.
#
:mungmail_extract: (:header_name:)
{
    #   Note that we add the colon here; don't put it into the
    #   :header_name: string.
    #
    call /:mungmail_grab_current_headers:/
    {
        #   The header must start at the beginning of the header region
        #   or right after a newline.  Whitespace after the colon is
        #   optional, so "Name:value" is found as well as "Name: value";
        #   when a space is present the captured text is unchanged,
        #   because the space is consumed by the value part.
        match [:current_headers:] <nocase> (:: :desired_header:) \
            /(?:^|\n)(:*:header_name: *:([^\n]|\n[[:space:]])*)/
        return /:*:desired_header:/
    }
    #   No such header: hand back an empty string.
    return //
}
#
# delete all current headers of this type (nothing is inserted here;
# see :mungmail_unique: for delete-then-add).
#
# :mungmail_delete: (:new_header:)
#   :new_header: - a header line such as "X-Foo: bar"; its first run of
#                  non-blank characters (normally "X-Foo:") is taken as
#                  the header type to delete.
#   Removes every header of that type, including continuation lines,
#   from the current header region.  Returns nothing.
#
:mungmail_delete: (:new_header:)
{
    call /:mungmail_grab_current_headers:/
    {
        #   Work out the header type.  If :new_header: holds no
        #   printable characters this match fails and the delete loop
        #   is skipped entirely, so a stale :new_header_type: from an
        #   earlier call can never be used.
        match (:new_header_type:) [:new_header:] /[[:graph:]]+/
        #
        #   a LIAF-loop to delete any header (including continuations)
        #   that has a type that matches the new_header_type.
        #   The header must start at the beginning of the header region
        #   or right after a newline (same idiom as :mungmail_extract:),
        #   so text in the middle of some other header is never cut
        #   out.  Only the second capture - the header itself - is
        #   deleted; the preceding newline stays.  Field names are
        #   compared case-insensitively, as :mungmail_extract: does.
        {
            match [:current_headers:] <nocase> (:: :kill_this_line:) \
                /(?:^|\n)(:*:new_header_type: ([^\n]|\n[[:space:]])*\n)/
            alter (:kill_this_line:) //
            liaf
        }
    }
    return
}
# delete all current headers of this type, insert ours instead.
#
# :mungmail_unique: (:new_header:)
#   :new_header: - complete header line to install, e.g. "X-Foo: bar".
#   Afterwards the message carries the given header at the end of the
#   header region, and any headers :mungmail_delete: matched for the
#   same type are gone.  Returns nothing.
#
:mungmail_unique: (:new_header:)
{
    #   Both helpers grab the current headers themselves, and
    #   :mungmail_delete: works out the header type on its own, so
    #   nothing needs to be prepared here.
    call /:mungmail_delete:/ [:*:new_header:]
    call /:mungmail_add:/ [:*:new_header:]
    return
}
#
# Helper routine to get the current mail headers of :_dw:
#
# Binds :current_headers: to the header part of the data window.
# Three cases are tried in order; the first one whose match succeeds
# returns:
#   A) a run of non-empty lines followed by an empty line
#   B) no empty line, but the text ends in a newline - use all of it
#   C) no final newline either - append one, then use all of it
# Takes no arguments and returns no value; raises a fault if none of
# the cases matched.
#
:mungmail_grab_current_headers:
{
{
# Grab everything before the first \n\n
# (the second capture is the run of non-empty lines, without the
# empty line that ends it)
match (:: :current_headers:) /(([^\n]+\n)+)\n/
# output /-A-->:*:current_headers:<---\n/
return
}
# if we got here, it wasn't a real message (void body, and/or no
# doubled newline) but it might still have useful text anyway.
# Is there a final newline?
{
match (:current_headers:) /^.*\n$/
# output /-B-->:*:current_headers:<---\n/
return
}
# if we got to here, then there wasn't even a final newline.
# That's a violation of RFC, we'll add it.
# Note that this case modifies the data window :_dw: itself.
{
alter (:_dw:) /:*:_dw:\n/
match (:current_headers:) /.+/
# output /-C-->:*:current_headers:<---\n/
return
}
# Only reached when every case above failed to match.
fault / Couldn't manage to find the headers, though I tried really hard\n/
}
# Last-resort fault handler.  The regex /.*/ matches any fault text,
# which is bound to :broken_program_message:.  The handler passes the
# current data window through with accept, writes the error report to
# both stdout and stderr, and then exits with
# :program_fault_exit_code: (set elsewhere in the file).
trap (:broken_program_message:) /.*/
{
# Emit the message text first, then the error report.
accept
output /:*:_nl: Aw, crud. mailfilter.crm broke. Here's the error: :*:_nl:/
output /:*:broken_program_message:/
output [stderr] /:*:_nl: ERROR: mailfilter.crm broke. Here's the error: :*:_nl:/
output [stderr] /ERROR: :*:broken_program_message:/
}
exit /:*:program_fault_exit_code:/
# File Manager Version 1.0, Coded By Lucas
# Email: hehe@yahoo.com