# Template file and instructions
#
# Creating a rules file will require consistent python syntax, but
# shouldn't require much if any programming knowledge beyond knowledge
# of HTML and regular expressions. Hopefully advanced users can
# contribute new import sites relatively easily.
#
# The instructions are here in comments. Below is a template for a
# dummy website. You're welcome to jump down to look at the sample
# first and then come back and read these detailed explanations as
# necessary.
#
# You can also see many other examples in the importers/html_plugins/
# directory (either in src/lib/ of your distribution folder or in
# /usr/lib/python2.4/site-packages/gourmet/).
#
# To make this rules file be loaded, rename it with a .py
# extension. For clarity, it helps to name the file for its website.
#
# This template should help you create a new rules file for your own
# webpage import. If you get a rules file, please consider
# contributing it to future versions of Gourmet by attaching the file
# as a "patch" at our sourceforge site:
# https://sourceforge.net/tracker/?atid=649654&group_id=108118&func=browse
#
#
# A basic template contains three variables (one of them optional):
# SUPPORTED_URLS (list of canonical name(s) of target websites),
# SUPPORTED_URLS_REGEXPS (optional list of regular expressions that
# match URLs for which we want to use this filter), and RULES (list of
# rules for scraping the website -- the bulk of the work).
#
#
# SUPPORTED_URLS = ['www.mywebsite.com','www.mywebsitesmirror.com']
# SUPPORTED_URLS_REGEXPS = ['.*\.mywebsite\.com']
# 
# RULES = [
#     ['attribute',
#      [PATH_TO_TAG],
#      'text', 'markup' OR 'attname'
#      post_processing # Either a function that returns a string or a dictionary
#                      # Or a tuple (regexp, force_match)
#                      # if force_match, then return "" if there is no match
#                      # else, return the unmodified string if there is no match
#                      # If a function, return a string for most cases. Return a dictionary
#                      # if we're doing ingredient parsing ({'amt':1,'unit':'cup',...})
#                      #
#                      # If a function, we take two args -- value (string or markup) and tag (tag object)
#                      # e.g.
#                      # def post_proc (value, tag): ... return "something"
#
#      ],
#     ...
#    ]
#
# attribute can be: title, servings, image, source, preptime,
# cooktime, instructions, modifications, ingredient_block (string) or ingredient_parsed (dictionary)
#
# attributes can be specified more than once, in which case the values
# will be appended to one another. For example, you might want to add
# various parts of the
# webpage to your instructions.
#
# PATH_TO_TAG is a list of search criteria that zero in on the HTML
# element we want to find, drilling down as necessary (see below)
#
# 'text' or 'markup' specifies whether we want markup when we grab the value
#
# POST_PROCESSING can be a function or a regexp.  If it is a regexp,
# it should have a grouping construct which will contain the text we
# want to keep.
#
# Each instruction in the PATH_TO_TAG is a python dictionary
# containing the following entries
#
# 'regexp': REGEXP a regular expression text search; if this is
#                  specified, anything else is ignored
# 'string': STRING a string text search; if this is specified, 
#                  everything else is ignored
# OR...
#
# 'tag': TAGNAME                  HTML Element Name
# 'attributes':{attname:value}    Attributes on the Element we're looking for
# 'index': INTEGER or [START,END] python index for the count of the tag
#                                 we start counting from 0
#
# EXAMPLES OF RULES
# 
# Here are some examples of how a rule can look.
#
# EXAMPLE 1
#
# Get the 'image' attribute from the IMG tag in the 2nd row of the
# first <table class='recipe'>. (Note: for img, we want the HTML img
# tag with its markup; our importer class will do the right thing and
# grab the image from the src attribute)
#
# <table class='recipe'><tr>...</tr><tr>...<IMG></tr></table>
# 
# ['image',
#  [{'tag':'table','attributes':{'class':'recipe'},'index':0},
#   {'tag':'tr','index':1},
#   {'tag':'img','index':0}],
#  'markup']
#
# EXAMPLE 2
#
# Get our "servings" value by searching all the text for the phrase
# "X servings"
#
# ['servings',
#  [{'regexp':'[0-9]+\s+servings?'}],
#  'text',
#  '([0-9]+)\s+servings?'] # our grouping construct will grab just the number.
#
# EXAMPLE 3
#
# Get our instructions from all <p class='body'> elements.
#
# ['instructions',
# [{'tag':'p',
#   'attributes':{'class':'body'},
#   'index':[0,None]
#   }],
# 'text']
#
# For a further sense of how things work, take a look at the actual
# rulesets below.
#
# You can test out rules as you write them by running
# python /usr/lib/python2.4/site-packages/gourmet/importers/html_plugins/__init__.py
# directly at a console.

# This is a simplified ruleset created to import Gourmet's own
# exported HTML files. Gourmet exports considerably neater and easier
# to import HTML than many websites. For examples of how to deal with
# messier HTML, you might look through the other html_plugins sets for
# e.g. allrecipes.com or epicurious.com

# Canonical name(s) of the website(s) this rules file handles.  A URL
# containing one of these strings will be imported with this filter.
SUPPORTED_URLS = [
    'www.mywebsite.com',
    ]

# Regular expressions that match websites for which this file's rules
# should be used.  You can simply use an empty list, written as [], if
# all websites will contain the canonical name (i.e. if all recipes on
# www.superrecipes.com begin with www.superrecipes.com).
#
# Raw strings (r'...') keep the backslashes literal, so '\.' reaches
# the regexp engine intact and Python does not warn about an invalid
# escape sequence.
SUPPORTED_URLS_REGEXPS = [r'.*\.mywebsite\.com', r'.*\.mywebsitemirror\.com']

# Below are rules for scraping a web page.
#
# These are rather complicated
RULES = [
    # Each rule is a list, contained in brackets.

    # Each rule starts with the name of the attribute, which can be
    # one of the following: title, servings, preptime, cooktime,
    # rating, instructions, modifications, ingredient_block (the chunk
    # of ingredients) or ingredient_parsed (a single dictionary or
    # list of dictionaries, with one dictionary per ingredient).
    #
    ['title', # attribute we're after
     [{'tag':'title'}], # path to tag we want (in this case, just the title)
     'text', # this says we want the contents of the tag
     ],
    ['category',
     # Per the rule format documented above, element attributes must be
     # nested under the 'attributes' key of each search dictionary; a
     # bare 'class' key would be ignored by the importer.
     [{'tag':'div',
       'attributes':{'class':'recipe'}},  # inside of <div class="recipe">...
      {'tag':'p',
       'attributes':{'class':'category'}} # inside of <p class="category">
      ],
     'text',
     ],
    ['image',
     [{'tag':'img'}],
     'src' # grab the value of the src attribute -- we'll correctly grab the image from the file
     ],

    ] # end RULES
