Package logsparser :: Package extras :: Module robots
[frames | no frames]

Source Code for Module logsparser.extras.robots

  1  # -*- coding: utf-8 -*- 
  2   
  3  # -*- python -*- 
  4   
  5  # pylogsparser - Logs parsers python library 
  6  # 
  7  # Copyright (C) 2011 Wallix Inc. 
  8  # 
  9  # This library is free software; you can redistribute it and/or modify it 
 10  # under the terms of the GNU Lesser General Public License as published by the 
 11  # Free Software Foundation; either version 2.1 of the License, or (at your 
 12  # option) any later version. 
 13  # 
 14  # This library is distributed in the hope that it will be useful, but WITHOUT 
 15  # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 
 16  # FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 
 17  # details. 
 18  # 
 19  # You should have received a copy of the GNU Lesser General Public License 
 20  # along with this library; if not, write to the Free Software Foundation, Inc., 
 21  # 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 
 22  # 
 23   
 24  """In this module we define a regular expression used to fetch the most common 
 25  robots.""" 
 26   
 27  import re 
 28   
 29  # taken from genrobotlist.pl in the awstats project : http://awstats.cvs.sourceforge.net 
 30  robots = [ 
 31      'antibot', 
 32      'appie', 
 33      'architext', 
 34      'bingbot', 
 35      'bjaaland', 
 36      'digout4u', 
 37      'echo', 
 38      'fast-webcrawler', 
 39      'ferret', 
 40      'googlebot', 
 41      'gulliver', 
 42      'harvest', 
 43      'htdig', 
 44      'ia_archiver', 
 45      'askjeeves', 
 46      'jennybot', 
 47      'linkwalker', 
 48      'lycos', 
 49      'mercator', 
 50      'moget', 
 51      'muscatferret', 
 52      'myweb', 
 53      'netcraft', 
 54      'nomad', 
 55      'petersnews', 
 56      'scooter', 
 57      'slurp', 
 58      'unlost_web_crawler', 
 59      'voila', 
 60      'voyager', 
 61      'webbase', 
 62      'weblayers', 
 63      'wisenutbot', 
 64      'aport', 
 65      'awbot', 
 66      'baiduspider', 
 67      'bobby', 
 68      'boris', 
 69      'bumblebee', 
 70      'cscrawler', 
 71      'daviesbot', 
 72      'exactseek', 
 73      'ezresult', 
 74      'gigabot', 
 75      'gnodspider', 
 76      'grub', 
 77      'henrythemiragorobot', 
 78      'holmes', 
 79      'internetseer', 
 80      'justview', 
 81      'linkbot', 
 82      'metager-linkchecker', 
 83      'linkchecker', 
 84      'microsoft_url_control', 
 85      'msiecrawler', 
 86      'nagios', 
 87      'perman', 
 88      'pompos', 
 89      'rambler', 
 90      'redalert', 
 91      'shoutcast', 
 92      'slysearch', 
 93      'surveybot', 
 94      'turnitinbot', 
 95      'turtlescanner', 
 96      'turtle', 
 97      'ultraseek', 
 98      'webclipping.com', 
 99      'webcompass', 
100      'yahoo-verticalcrawler', 
101      'yandex', 
102      'zealbot', 
103      'zyborg', 
104  ] 
105  robot_regex = re.compile("|".join(robots), re.IGNORECASE) 
106