File Manager

Current File : //usr/share/crm114/mailreaver.crm
#! /usr/bin/crm
# -(spam good cache dontstore stats_only outbound undo verbose maxprio minprio delprio fileprefix config)
#
#	mailreaver.crm - 3rd Generation mail filter "standard script"
#
#   Note to SunOS and FreeBSD users - do not place command arguments of
#   "-([arguments])" format on the first line of this program
#   or you will not get  what you expect. This is due to a kernel
#   difference in how a bangline should be dealt with.

# Copyright 2002-2009 William S. Yerazunis.
# This file is under GPLv3, as described in COPYING.

#     This is MailReaver, the 3rd Generation mail filter "standard script"
#     for CRM114.  The goal is to make a more maintainable, understandable,
#     and easier-to-customize mail filter.
#
#    1) we use the consolidated library "maillib.crm" for most shareable
#       things like parsing the .cf file, munging text, cacheing text, etc.
#
#    2) we always use the CacheIDs and the Reaver (maildir-like) format
#       for storing incoming email in unaltered form if there is
#       any possibility of training it.
#
#    3) We always train using mailtrainer.crm rather than training
#       internally.  Thus, if you want to change the way things are
#       trained, you need to look at mailtrainer.crm as well.
#
#
#     We support the following commands, on both the command line
#     in the form "--blahblah --blahblah"  and
#     by using the "command {secretpassword} word word" method
#
#       spam		- this is spam.  Treat appropriately.[*]
#       good		- this is good.  Treat appropriately.[*]
#       cache           - pretend the email starts on the following line.
#                         process it normally and put it in the cache.
#       dontstore         - do NOT store this text into the cache.  Use
#                         this for text that you never want to train.
#	stats_only	- output ONLY the status.  Nothing else.
#			Since this means there's no output of the
#                       CacheIDs, we don't store the text for later training.
#       report_only     - output ONLY the headers that would have
#                       been added, not the entire text.
#	outbound	- this is an "outbound" message and hence can be
#			assumed to be nonspam.  Train if necessary.
#	undo		- the message was mis-trained as type X.  Undo that.
#	verbose		- tell me more.
#	maxprio +/-	- add a maximum priority entry
#       minprio +/-	- add a minimum priority entry
#       delprio         - delete a prio list entry.
#       fileprefix=dir  - look in the dir for the .css and .cf files
#       config=filename - use filename as the .cf file
#
#     [*]  - meaning that if it's a command line flag, the entire
#     standard input is the example text and if it's an inline
#     command, then the example text follows the inline commandline.
#
#     Overall Design:
#
#	1) Read in the parameter file
#
#       2) Check for commands.  Set flags as appropriate
#
#	3) Are any command flags set?
#
#          3a) run those commands
#
#          3b) report the results and exit
#
#        ... otherwise...
#
#	4) Run the priolist
#
#	5) Run the classifier
#
#	6) Dispatch on the result
#
##############################################################
#
#        Step 1 - Read in the parameter file
#
#############################################################
#
#    ---  uncomment this if you want to include a "forced"
#         configuration file  ---
#
# insert mailfilterconfig.crm
#
#
#    --- These vars must have a value, or else we'll get errors ----
#
#   Directory prefix used for the .cf, .css, priolist and log files.
#   It is isolated with <default> so that a --fileprefix=dir given on the
#   command line (see the option list in the header) is not overwritten.
isolate <default> (:fileprefix:) //
#   Text explaining the verdict; the priolist section alters it when a
#   priority pattern matches.
isolate (:classifier_reason:) /no reason yet/
#   Not referenced again in the visible part of this file -- presumably
#   used by maillib.crm or kept for compatibility.  TODO confirm.
isolate (:classify_status:) //
#   Exit status used by the final "exit" after :finish_up:; each of the
#   :looks_*: sections alters it.
isolate (:our_exit_code:) /0/
#   Statistics text; replaced by the priolist or the classify step.
isolate (:stats:) / pR: 0.000000 /
#   The pR figure that ends up in the X-CRM114-Status header.
isolate (:pr:) / pR: 0.00000/
#   Subject text used on the "mail -s" command line in :looks_spam:.
isolate (:subj_text:) / (None) /
#   Not referenced again in the visible part of this file.
isolate (:add_extra_stuff:) //
#   Number of leading characters of :in_text: kept before preprocessing.
isolate (:decision_length:) /8192/
#
#      Isolate these email addresses, and give them values,
#      in case the user doesn't.
isolate (:reject_address:) //
isolate (:unsure_address:) //
isolate (:fail_priority_mail_to:) //
isolate (:fail_classify_mail_to:) //
isolate (:fail_blacklist_mail_to:) //
isolate (:fail_SSM_mail_to:)  //
isolate (:log_rejections:) //
#   <default> again: keep a report_only value that was set before we got here.
isolate <default> (:report_only:) //
#
#
#        load the config file
#   :config: defaults to mailfilter.cf under :fileprefix: unless given as
#   --config=filename.  :load_cf_file: is not defined in the visible part of
#   this file; the header comment says maillib.crm handles .cf parsing.
isolate <default> (:config:) /:*:fileprefix:mailfilter.cf/
call /:load_cf_file:/ [:*:config:]

#######################################################################
#
#    Do a quick check- has the password been changed or not?  If it's
#    still the default, put in something that will be well-nigh unguessable
#    (esp. since it will contain received headers that the sender cannot
#    see nor control.)
#   If the configured password :spw: still contains the text
#   DEFAULT_PASSWORD, replace it with a hash of the environment string plus
#   the current message.  When the match fails the block is skipped and
#   :spw: is left as configured.
{
    match [:spw:] /DEFAULT_PASSWORD/
    #  yes, it's the same as default.  So we scramble it just so
    #  nobody can hack in without major trickery.
    hash (:spw:) /:*:_env_string::*:_dw:/
}

######################################################################
#
#     if a particular "fail" category hasn't been assigned, but
#     the :general_fails_to: category has, then send there instead
#   Fallback for the priority-failure address: <absent> makes the match
#   succeed only when :fail_priority_mail_to: has no characters at all, in
#   which case it is given the value of :general_fails_to:.
{
    match [:fail_priority_mail_to:] /./ <absent>
    isolate (:fail_priority_mail_to:) /:*:general_fails_to:/
}
#   Same fallback for the classifier-failure address.
#   NOTE(review): :fail_blacklist_mail_to: and :fail_SSM_mail_to: get no
#   such fallback here -- confirm that is intended.
{
    match [:fail_classify_mail_to:] /./ <absent>
    isolate (:fail_classify_mail_to:) /:*:general_fails_to:/
}


#########################################################################
#
#       START OF ACTUAL MAIL PROCESSING HERE
#
#########################################################################
#
#      Does the user want us to log all incoming mail?  This is handy for
#      testing and auditing purposes; by default it is turned off in the
#      configuration file.
#   Fixed-name audit log: when :log_to_allmail.txt: contains "yes", append
#   the mail separator and then the unmodified input to allmail.txt under
#   :fileprefix:.  A failed match skips the rest of the bracket block.
{
    match [:log_to_allmail.txt:] /yes/
    output [:*:fileprefix:allmail.txt] <append> /:*:mail_separator:/
    output [:*:fileprefix:allmail.txt] <append> /:*:_dw:/
}

#     allow logging to anywhere...
#   Same as above, but the file name (relative to :fileprefix:) comes from
#   :log_all_mail_to_file:; any non-empty value turns it on.
{
    match [:log_all_mail_to_file:] /./
    output [:*:fileprefix::*:log_all_mail_to_file:] <append> /:*:mail_separator:/
    output [:*:fileprefix::*:log_all_mail_to_file:] <append> /:*:_dw:/
}


########################################################################
#
#       2) Check for a command.  Set flags as appropriate.  Note that we
#       can't just dispatch on the presence of a flag, because we need
#	to merge in anything that might be in an inline command.
#
########################################################################
#
#      Our commands are of the form:
#
#      command password <command>  extra_args
#
#      and the extra_args are one or more of
#      	spam
#       good
#	cache
#       dontstore
#	stats_only
#	outbound
#	undo
#	verbose
#
#
#    Start our searching at the start of our input.
#        If nothing else requires it, we use the current input as the
#        text to be operated upon.
#   Inline-command parser.
#
#   Makes sure the flag variables exist, takes the whole input as the
#   default working text, then looks for a line of the form
#       command <password> <words...>
#   When one is found, :in_text: is rebound to the text following that line
#   and each recognised word sets its flag variable to SET.  The three
#   priolist commands (maxprio / minprio / delprio) are carried out at once:
#   they rewrite priolist.mfp, add a success header, emit the mail and exit.
#
#   The flag tests are plain substring matches against :cmds:.
{
    isolate (:spam: :good: :cache: :dontstore: :stats_only: )
    isolate (:outbound: :undo: :verbose:)
    #   Default working text is the complete input (the data window).
    isolate (:in_text:)  /:*:_dw:/
    #   now find the command, and set the cmdline insert point.
    match (:: :cmdline:) /.*(.)/
    match (:cmdline:) /\n\n/
    #   Everything after the password is captured in :cmds:.  If there is
    #   no command line this match fails and the rest of the block is
    #   skipped, leaving the flags as they were.
    match <nomultiline> (:cmdline: :cmds:) /^command :*:spw: (.*)$/
    {
	#    Yep, found a command.  Grab remaining text in case
	#     we aren't using the cached version
	match <fromend> (:in_text:) /.*/
	#
	#      Parse out the command (and in the case of the prio lists,
	#      actually do the work)
	{
	    #      Command to learn spam
	    isolate (:spam:) <default> //
	    match [:cmds:] /spam/
	    alter (:spam:) /SET/
	}
	{
	    #      Command to learn good
	    isolate (:good:) <default> //
	    match [:cmds:] /good/
	    alter (:good:) /SET/
	}
	{
	    #      Command to use the cached text copy
	    isolate (:cache:) <default> //
	    match [:cmds:] /cache/
	    alter (:cache:) /SET/
	}
	{
	    #      Command to NOT store this in the cache for later
	    isolate (:dontstore:) <default> //
	    match [:cmds:] /dontstore/
	    alter (:dontstore:) /SET/
	}
	{
	    #      Command to run stats_only, which implies dont store.
	    isolate (:stats_only:) <default> //
	    match [:cmds:] /stats_only/
	    alter (:stats_only:) /SET/
	}
	{
	    #      Command to run outbound mode
	    isolate (:outbound:) <default> //
	    match [:cmds:] /outbound/
	    alter (:outbound:) /SET/
	}
	{
	    #      Command to UNdo
	    isolate (:undo:) <default> //
	    match [:cmds:] /undo/
	    alter (:undo:) /SET/
	}
	#
	#     The following commands can only be used inline, not
	#     on the command line, and they do NOT use the text.
	{
	    #      Command to set a maxprio entry
	    #      The pattern is a + or - followed by non-blank characters.
	    #      No trailing blank is required after it (same as minprio
	    #      and delprio), so the command also works when the pattern
	    #      is the last thing on the command line.
	    match [:cmds:] \
		    /maxprio ([-+][[:graph:]]+)/ \
		    (:: :prio_regex:)
	    #      New entry goes in front of the existing list.
	    input [:*:fileprefix:priolist.mfp] (:priotext:)
	    alter (:priotext:) \
		    /:*:prio_regex:\n:*:priotext:/
	    output [:*:fileprefix:priolist.mfp] /:*:priotext:/
	    call /:mungmail_add:/ [X-CRM114-Success: Added new highest priority entry ":*:prio_regex:" ]
	    accept
	    exit
	}
	{
	    #      Command to set a minprio entry
	    match [:cmds:] \
		    /minprio ([-+][[:graph:]]+)/ \
		    (:: :prio_regex:)
	    #      New entry goes after the existing list.
	    input [:*:fileprefix:priolist.mfp] (:priotext:)
	    alter (:priotext:) \
		    /:*:priotext:\n:*:prio_regex:/
	    output [:*:fileprefix:priolist.mfp] /:*:priotext:/
	    call /:mungmail_add:/ [X-CRM114-Success: Added new lowest priority entry ":*:prio_regex:" ]
	    accept
	    exit
	}
	{
	    #      Command to delete a priolist entry
	    match [:cmds:] \
		    /delprio ([[:graph:]]+)/ \
		    (:: :prio_regex:)
	    input [:*:fileprefix:priolist.mfp] (:priotext:)
	    #      Find the first line containing the given text and replace
	    #      that whole line with a bare newline.
	    match [:priotext:] <nomultiline> \
		    /^.*:*:prio_regex:.*$/ (:die:)
	    alter (:die:) /\n/
	    output [:*:fileprefix:priolist.mfp] /:*:priotext:/
	    call /:mungmail_add:/ [X-CRM114-Success: Deleted priority entry ":*:prio_regex:" ]
	    accept
	    exit
	}
    }
}

#######  Inter-flag dependencies fixed up here.

#   stats_only implies dontstore: when :stats_only: contains SET, force
#   :dontstore: to SET as well (the header says no CacheID is output in
#   stats_only mode, so the text is not stored).
{
    match [:stats_only:] /SET/
    alter (:dontstore:) /SET/
}



###################################################################
#
#       See if there's already a CacheID in the headers- if so, grab it.
#
#   Start with an empty :cacheid:; it stays empty when no id is found,
#   which is what the cache-store step further down tests for.
#
#   NOTE(review): the id is taken from the message headers and accepts any
#   run of [[:graph:]] characters.  It is later interpolated into syscall
#   command lines (ls, the cache dupe command, rm -rf, mailtrainer).  If
#   syscall hands the string to a shell, characters such as ; | ` $ or ../
#   in a forged header could be dangerous -- confirm, and consider limiting
#   the accepted characters to those that :reavercache_init: can generate.
isolate (:cacheid:) //
{
    {
	#   First choice: an explicit X-CRM114-CacheID header.  The text
	#   after "sfid-" becomes :cacheid:, and :long_cacheid: is the
	#   matching path under :text_cache:/texts/.
	match [:in_text:] <nomultiline> (:: :cacheid:)  \
		/X-CRM114-CacheID:.*sfid-([[:graph:]]+)/
	isolate (:cacheid:)
	isolate (:long_cacheid:) /:*:text_cache:\/texts\/:*:cacheid:/
    }
    alius    #  nope, not in the explicit headers.  Check for it as a
    #           comment in the Message-Id: header.
    {
	#   Second choice: a "(sfid-...)" comment inside Message-Id.
	match [:in_text:] <nomultiline> (:: :cacheid:)  \
		/Message-Id:.*\(sfid-([[:graph:]]+)\)/
	isolate (:cacheid:)
	isolate (:long_cacheid:) /:*:text_cache:\/texts\/:*:cacheid:/
    }
    #      ADD OTHER PLACES TO CHECK HERE
}

####################################################################
#
#      Do we save the text into the cache, or has that already happened
#
# Unless "dontstore" is set or we already have a CacheID, we're supposed
# to save the text.  Saving is the usual case, mind you.  Normally this
# is the whole text, but if we had an inline "command cache" line then
# use whatever text follows the "command password cache" line and stuff
# that into the cache rather than the whole text.  You don't normally
# need to do this except as a prelude to training this new text.
#
#   Store the text in the cache only when both conditions hold:
#   :cacheid: is still empty (no id was found in the headers) and
#   :dontstore: does not contain SET.  Either failed match skips the block.
{
    match <absent> [:cacheid:] /./
    match <absent> [:dontstore:] /SET/
    #      yes - so the text to be worked follows the command line.
    #      which is already in :in_text:   This also calculates the
    #      new cacheids named :cacheid: and :long_cacheid: .
    #   NOTE(review): the argument passed below is :_dw: (the whole data
    #   window), not :in_text:.  Whether :reavercache_store: uses
    #   :in_text: internally cannot be seen from this file -- confirm.
    call /:reavercache_init:/
    call /:reavercache_store:/ [:*:_dw:]
}

#####################################################################
#####################################################################
#
#	Command flags are all set; at this point we can run strictly from
#	the :var: values and the text held in :in_text:
#
#####################################################################
#####################################################################


#
#       We still need to cope with the following possibilities:
#        learn as spam, learn as nonspam, classify, and stats_only.
#
#       But at least now we can run the preprocessing on :in_text:
#
#output /\nPREPROCESS INPUT:\n :*:in_text:\n/
#   Trim :in_text: to at most :decision_length: characters (start 0,
#   length :decision_length:) and run it through :mail_preprocess:, whose
#   result replaces :in_text:.
{
    #  note - this work gets thrown away if we are training from cache
    match (:chopped:) [:in_text: 0 :*:decision_length:] /.*/
    alter (:in_text:) /:*:chopped:/
    call /:mail_preprocess:/ [:*:in_text:] (:in_text:)
}
#output /\nPREPROCESS RESULT:\n :*:in_text:\n/

#
#    Are we supposed to learn this as spam?
#   Learn-as-spam path.  Entered only when :spam: contains SET; both inner
#   branches end in "exit", so nothing after this block runs in that case.
{
    match [:spam:] /SET/
    {
	#   Check that the cached copy exists: list :long_cacheid: and look
	#   for the id in the listing.  If the match fails, control passes
	#   to the alius branch below.
	isolate (:tmp:) //
	syscall () (:tmp:) /\/bin\/ls :*:long_cacheid:/
	match [:tmp:] /:*:cacheid:/
	#
	#      remember this file on a permanent basis by linking it into
	#      the known-spam directory.
	syscall /:*:cache_dupe_command: :*:text_cache:\/texts\/:*:cacheid: :*:text_cache:\/known_spam\/:*:cacheid: /
	#
	#      Now run mailtrainer.crm on the new copy
	#   The "good" side is pointed at the "empty" directory under
	#   :text_cache:, so only the spam example is supplied.
	isolate (:mailtrainer_output:)
	syscall /:*:fileprefix::*:trainer_invoke_command: --fileprefix=:*:fileprefix: --spam=:*:text_cache:\/known_spam\/:*:cacheid: --good=:*:text_cache:\/empty\/ / () (:mailtrainer_output:)
	#output /mailtrainer output:  ':*:mailtrainer_output:'\n/
	#      and remove it from the prob_* directories, as
	#      now it's known
	syscall /\/bin\/rm -rf  :*:text_cache:\/prob_spam\/:*:cacheid:/
	syscall /\/bin\/rm -rf  :*:text_cache:\/prob_good\/:*:cacheid:/
	#
	#      now it's trained; put in a marker in the headers
	call /:mungmail_delete:/ [X-CRM114-Status: ]
	call /:mungmail_delete:/ [X-CRM114-Unsure: ]
	call /:mungmail_add:/ [X-CRM114-Action: LEARNED AND CACHED SPAM]
	#   Insert the training report in the msgbody, if desired
	{
	    #   :firstline: is bound to the empty spot right after the first
	    #   blank line of the message, then altered to hold the report.
	    match [:add_mailtrainer_report:] /yes/
	    match (:: :firstline:) /.*(.)/
	    match (:: :firstline:) /\n\n()/
	    alter (:firstline:) / -------\n :*:mailtrainer_output:\n ------ \n/
	}
	{
	    #   Optional subject tag confirming the training.
	    match [:confirm_flag_subject_string:] /./
	    call /:mungmail_mung_subject:/ [:*:confirm_flag_subject_string:]
	}
	accept
	exit /:*:accepted_mail_exit_code:/
    }
    alius
    {
	#  GROT GROT GROT  We should make a better attempt at finding
	#  the file, like looking in known_spam and known_good.
	#
	#   Cached text not found: insert a problem notice after the first
	#   blank line, emit the mail and exit with the accepted-mail code.
	match (:: :firstline:) /.*(.)/
	match (:: :firstline:) /\n\n()/
	alter (:firstline:) /\n\n-----\n  Problem: couldn't find the cached text.\n  Perhaps you've already trained it?  \n No action taken.\n\n/
	accept
	exit /:*:accepted_mail_exit_code:/
    }
}
#
#    Are we supposed to learn this as good?
#   Learn-as-good path.  Entered only when :good: contains SET; both inner
#   branches end in "exit".  It mirrors the learn-as-spam block, using the
#   known_good directory and the --good argument instead.
{
    match [:good:] /SET/
    {
	#   Check that the cached copy exists: list :long_cacheid: and look
	#   for the id in the listing.  If the match fails, control passes
	#   to the alius branch below.
	isolate (:tmp:) //
	syscall () (:tmp:) /\/bin\/ls :*:long_cacheid:/
	match [:tmp:] /:*:cacheid:/
	#
	#      remember this file on a permanent basis by linking it into
	#      the known-good directory.
	syscall /:*:cache_dupe_command: :*:text_cache:\/texts\/:*:cacheid: :*:text_cache:\/known_good\/:*:cacheid: /
	#
	#      Now run mailtrainer.crm on the new copy
	#   The "spam" side is pointed at the "empty" directory under
	#   :text_cache:, so only the good example is supplied.
	isolate (:mailtrainer_output:)
	syscall /:*:fileprefix::*:trainer_invoke_command: --fileprefix=:*:fileprefix: --good=:*:text_cache:\/known_good\/:*:cacheid: --spam=:*:text_cache:\/empty\/ / () (:mailtrainer_output:)
	#output /mailtrainer output:  ':*:mailtrainer_output:'\n/
	#      and remove it from the prob_* directories, as
	#      now it's known
	syscall /\/bin\/rm -rf  :*:text_cache:\/prob_spam\/:*:cacheid:/
	syscall /\/bin\/rm -rf  :*:text_cache:\/prob_good\/:*:cacheid:/
	#
	#      now it's trained; put in a marker in the headers
	call /:mungmail_delete:/ [X-CRM114-Status: ]
	call /:mungmail_delete:/ [X-CRM114-Unsure: ]
	call /:mungmail_add:/ [X-CRM114-Action: LEARNED AND CACHED GOOD]
	#   Insert the training report in the msgbody, if desired
	{
	    #   :firstline: is bound to the empty spot right after the first
	    #   blank line of the message, then altered to hold the report.
	    match [:add_mailtrainer_report:] /yes/
	    match (:: :firstline:) /.*(.)/
	    match (:: :firstline:) /\n\n()/
	    alter (:firstline:) / -------\n :*:mailtrainer_output:\n ------ \n/
	}
	{
	    #   Optional subject tag confirming the training.
	    match [:confirm_flag_subject_string:] /./
	    call /:mungmail_mung_subject:/ [:*:confirm_flag_subject_string:]
	}
	accept
	exit /:*:accepted_mail_exit_code:/
    }
    alius
    {
	#  GROT GROT GROT  We should make a better attempt at finding
	#  the file, like looking in known_spam and known_good.
	#
	#   Cached text not found: insert a problem notice after the first
	#   blank line, emit the mail and exit with the accepted-mail code.
	match (:: :firstline:) /.*(.)/
	match (:: :firstline:) /\n\n()/
	alter (:firstline:) /\n\n-----\n  Problem: couldn't find the cached text.\n  Perhaps you've already trained it?  \n No action taken.\n\n/
	accept
	exit /:*:accepted_mail_exit_code:/
    }
}

##########################################################################
#
#      Not a learn, so it's a CLASSIFY job.  Maybe full, maybe stats_only.
#      We try the Priolist first (eventually) then fall back on the
#      classifiers.   Priolist patterns start in column 1, and are a + or
#      a - immediately followed by a regex.
#
#   Read the priority list file into :priolist:.  Each line is a single
#   + or - character followed by a regex.
isolate (:priolist:)
input (:priolist:) [:*:fileprefix:priolist.mfp]
#
#
{
    #... Grab the next one-per-line pattern: :pm: is the leading
    #    character (+ or -) and :pat: is the regex that follows it.
    #    <fromend> continues after the previous match, so each pass of the
    #    loop takes the next line; when no line is left the match fails
    #    and the loop ends.
    match <fromend nomultiline> (:w: :pm: :pat:) [:priolist:]  /(.)(.+)/
    #... see if this regex matches the incoming mail
    {
	match <nomultiline> (:reason:) /:*:pat:/
	#  Yep, it matched... branch based on pm
	#
	{
	    #   "+" entry: force a whitelisting result (pR 999.99).
	    match [:pm:] /[+]/
	    # put in a little tag saying why prio-listed
	    alter (:classifier_reason:) /** ACCEPT: CRM114 Priority Whitelisted by: :*:reason: **:*:_nl:/
	    alter (:stats:) /Match to priority pattern :*:pat:\n Forced pR: 999.99 /
	    match [:stats:] (:: :pr:) /Forced pR: ([[:graph:]]+)/
	    goto /:looks_good:/
	}
	#   No, we didn't have a +, was it a '-'?
	{
	    #   "-" entry: force a blacklisting result (pR -999.99), point
	    #   :reject_address: at the priority-failure address and append
	    #   the message to rejected_by_blacklist.txt.
	    match [:pm:] /[-]/
	    alter (:classifier_reason:) /** REJECT: CRM114 Priority Blacklisted by: :*:reason: **:*:_nl:/
	    alter (:reject_address:) /:*:fail_priority_mail_to:/
	    {
		output [:*:fileprefix:rejected_by_blacklist.txt] <append> /:*:_dw:/
	    }
	    alter (:stats:) /Match to priority pattern :*:pat:\n Forced pR: -999.99 /
	    match [:stats:] (:: :pr:) /pR: ([[:graph:]]+)/
	    goto /:looks_spam:/
	}
    }
    #   Nope, didn't match as a priority... grab the next regex until
    #   there are no prio-list regexes left
    liaf
}

#######################################################################
#
#      No joy on the priolist.  Use the Classifier.
#
#   Statistical classification.  Runs the classifier on :in_text: against
#   nonspam.css and spam.css, pulls the pR value of the first class (#0)
#   out of the statistics text into :pr:, and then either prints just that
#   value (stats_only) or jumps to one of the three dispatch labels.
{
    isolate (:stats:) //
    classify [:in_text:] <:*:clf:> /:*:lcr:/ \
	    (:*:fileprefix:nonspam.css :*:fileprefix:spam.css) (:stats:)
    match [:stats:] <nomultiline> \
	    /^#0.* pR: ([-. 0-9]+)/ (:: :pr:)
    {
	#    Is this a stats_only run?
	#
	#    If so, print only the pR value and leave.  The exit status is
	#    :accepted_mail_exit_code:, the same variable every other
	#    "accepted" exit in this file uses.
	match [:stats_only:] /SET/
	output /:*:pr:/
	exit /:*:accepted_mail_exit_code:/
    }
    #
    #      It's a pure classify.  Three possibilities:
    {
	#   Case 1 - It's spam.
	#   (pR at or below :spam_threshold:)
	eval /:@: :*:pr: <= :*:spam_threshold: :/
	goto /:looks_spam:/
    }
    {
	#   Case 2 - It's good.
	#   (pR at or above :good_threshold:)
	eval /:@: :*:pr: >= :*:good_threshold: :/
	goto /:looks_good:/
    }
    {
	#   Case 3 - Unsure
	#   (neither threshold test succeeded)
	goto /:looks_unsure:/
    }
}
##################################################################
#
#      Final Dispatch - one of :looks_spam:, :looks_good:, or :looks_unsure:
#      will be gone-to.
#
#################################################################

#   Dispatch target for mail judged to be spam (reached from the priolist
#   "-" branch or from the classifier).  Logs and files the message, tags
#   subject and headers, and either forwards it by mail and exits here, or
#   falls through to :finish_up:.
:looks_spam:
{
    #     Do we log rejections somewhere?
    {
	match [:log_rejections_to_file:] /./
	output [:*:fileprefix::*:log_rejections_to_file:] <append> /:*:mail_separator:/
	output [:*:fileprefix::*:log_rejections_to_file:] <append> /:*:_dw:/
    }

    #     Do we put prob_spams into the prob_spam directory?
    #     Only when both :text_cache: and :cacheid: are non-empty.
    {
	match [:text_cache:] /./
	match [:cacheid:] /./
	syscall /:*:cache_dupe_command: :*:text_cache:\/texts\/:*:cacheid: :*:text_cache:\/prob_spam\/:*:cacheid: /
    }

    #     flag the subject line
    {
	match [:spam_flag_subject_string:] /./
	call /:mungmail_mung_subject:/ [:*:spam_flag_subject_string:]
    }
    isolate (:subj_text:)
    alter (:our_exit_code:) /:*:rejected_mail_exit_code:/
    #   NOTE(review): this overwrites the :reject_address: that the
    #   priolist "-" branch set from :fail_priority_mail_to:, so
    #   blacklisted mail is sent to :fail_classify_mail_to: as well --
    #   confirm that is intended.
    alter (:reject_address:) /:*:fail_classify_mail_to:/
    {
	match [:add_headers:] /yes/
	call /:mungmail_unique:/ \
		[X-CRM114-Version: :*:_crm_version: MR-:*:_pgm_hash: ]
	call /:mungmail_add_cache_info:/ [:*:cacheid:]
	call /:mungmail_unique:/ [X-CRM114-Status: SPAM  ( :*:pr: )]
	call /:mungmail_delete:/ [X-CRM114-Notice: ]
    }
    #
    #    Since sending mail needs complicated args, we do it here rather
    #    than in a mungmail routine - and if we send mail, we exit here
    #    rather than in the usual finish-up routine.
    #
    #    The mail is sent only when :reject_address: has at least three
    #    characters; the whole message is fed to the mail command's input.
    #    NOTE(review): :subj_text: is placed inside single quotes on the
    #    command line.  It is not assigned in the visible part of this
    #    file (presumably :mungmail_mung_subject: sets it); if it can
    #    contain a quote taken from the Subject header, that needs
    #    escaping -- confirm.
    {
	match [:reject_address:] /.../
        syscall (:*:_dw:) /mail :*:reject_address: -s ' :*:subj_text: '/
	exit /:*:our_exit_code:/
    }
}
goto /:finish_up:/

#   Dispatch target for mail judged to be good (reached from the priolist
#   "+" branch or from the classifier).  Files the message under
#   prob_good, sets the exit code, tags subject and headers, then goes to
#   :finish_up:.
:looks_good:
{
    #     Do we put prob_good mail into the prob_good directory?
    #     Only when both :text_cache: and :cacheid: are non-empty.
    {
	match [:text_cache:] /./
	match [:cacheid:] /./
	syscall /:*:cache_dupe_command: :*:text_cache:\/texts\/:*:cacheid: :*:text_cache:\/prob_good\/:*:cacheid: /
    }


    alter (:our_exit_code:) /:*:accepted_mail_exit_code:/
    #     Optional subject tag for good mail.
    {
	match [:good_flag_subject_string:] /./
	call /:mungmail_mung_subject:/ [:*:good_flag_subject_string:]
    }
    #     Header bookkeeping, only when :add_headers: contains "yes":
    #     drop any earlier Notice, then write Version, cache info and
    #     the GOOD status with the pR value.
    {
	match [:add_headers:] /yes/
	call /:mungmail_delete:/ [X-CRM114-Notice: ]
	call /:mungmail_unique:/ \
		[X-CRM114-Version: :*:_crm_version: MR-:*:_pgm_hash: ]
	call /:mungmail_add_cache_info:/ [:*:cacheid:]
	call /:mungmail_unique:/ [X-CRM114-Status: GOOD ( :*:pr: )]
    }
}
goto /:finish_up:/

#   Dispatch target for mail the classifier could not place on either side
#   of the thresholds.  Sets the exit code, tags subject and headers
#   (including a request to train the message), then goes to :finish_up:.
#   Unlike the spam and good targets, nothing is copied into a prob_*
#   directory here.
:looks_unsure:
{
    alter (:our_exit_code:) /:*:unsure_mail_exit_code:/
    #     Optional subject tag for unsure mail.
    {
	match [:unsure_flag_subject_string:] /./
	call /:mungmail_mung_subject:/ [:*:unsure_flag_subject_string:]
    }
    #     Header bookkeeping, only when :add_headers: contains "yes".
    {
	match [:add_headers:] /yes/
	call /:mungmail_unique:/ \
		[X-CRM114-Version: :*:_crm_version: MR-:*:_pgm_hash: ]
	call /:mungmail_add_cache_info:/ [:*:cacheid:]
	call /:mungmail_unique:/ [X-CRM114-Status: UNSURE ( :*:pr: )]
	call /:mungmail_unique:/ [X-CRM114-Notice: Please train this message. ]
    }
}
goto /:finish_up:/


###############################################################
#
#         Finish up - common exit routine

#   Common exit path for the three :looks_*: targets: optionally strip the
#   message down to its X-CRM114 lines, emit the data window with "accept",
#   and exit with the code chosen by the dispatch section.
:finish_up:
{
    {
	#   If :report_only: is SET then delete everything that's
	#    not a X-CRM114 header
	match [:report_only:] /SET/
	#   Empty match on the data window; presumably this resets the
	#   position the <fromend> match below continues from -- TODO confirm.
	match [:_dw:] //
	{
		#   Take the next line (up to and including its newline)
		#   as :z:; when no line is left the match fails and the
		#   loop ends.
		match [:_dw:] <fromend> /.*?\n/ (:z:)
                {
		      #   Lines that do not start with X-CRM114 are
		      #   replaced by nothing.
		      match [:z:] <absent> /^X-CRM114/
		      # output /Deleting :*:z:\n/
                      alter (:z:) //
		}
		liaf
        }
    }
    accept
}
exit /:*:our_exit_code:/

################################################################
#
#        Catch failures.

#   Catch-all fault handler.  Any fault text matching /.*/ is captured in
#   :broken_program_message:.  The handler first emits the message as it
#   stands ("accept") so the mail is passed through, then reports the fault
#   on both standard output and stderr, and finally exits with
#   :program_fault_exit_code:.
trap (:broken_program_message:) /.*/
{
        accept
        #   The newline variable is written as :*:_nl: in both places,
        #   matching the stderr line below.
        output /:*:_nl: Aw, crud.  mailreaver.crm broke.  Here's the error: :*:_nl:/
        output /:*:broken_program_message:/
        output [stderr] /:*:_nl: ERROR: mailreaver.crm broke.  Here's the error\: :*:_nl:/
        output [stderr] /ERROR: :*:broken_program_message:/
    }
exit /:*:program_fault_exit_code:/


######################################################################3
#
#              Library insertions start here.
insert /usr/share/crm114/maillib.crm
File Manager Version 1.0, Coded By Lucas
Email: hehe@yahoo.com