# robots.txt
#
# http://www.4cheaters.de/
#
Sitemap: http://www.4cheaters.de/sitemap.xml

User-agent: *
Disallow: /cgi-bin
Disallow: /wp-admin/
Disallow: /wp-includes/
Disallow: /wp-
Disallow: /trackback/
Disallow: /linkex
Disallow: /admin
Disallow: /amazon
Disallow: /banner
Disallow: /i
Disallow: /sys
Disallow: /modules
Disallow: /cdn
Disallow: /js
Disallow: /libs

User-agent: Googlebot
Disallow: /*.js$
Disallow: /*.inc$
Disallow: /*.css$
Disallow: /*.gz$
Disallow: /*.wmv$
Disallow: /*.cgi$
Disallow: /*.xhtml$

# Prefetching
User-agent: Fasterfox
Disallow: /

# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /

User-agent: DOC
Disallow: /

User-agent: Zao
Disallow: /

# We are just a few Seattle-based guys trying to figure out how to make internet data as open as possible.
# http://www.dotnetdotcom.org/
User-agent: dotbot
Disallow: /

User-agent: discobot
Disallow: /

User-agent: ccbot
Disallow: /

User-agent: VoilaBot
Disallow: /

User-agent: MJ12bot
Disallow: /

User-agent: Yanga
Disallow: /

User-agent: Speedy
Disallow: /

User-agent: duggmirror
Disallow: /

User-agent: Similarpages
Disallow: /

User-agent: Nutch
Disallow: /

# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Disallow: /

User-agent: Zealbot
Disallow: /

User-agent: MSIECrawler
Disallow: /

User-agent: SiteSnagger
Disallow: /

User-agent: WebStripper
Disallow: /

User-agent: WebCopier
Disallow: /

User-agent: Fetch
Disallow: /

User-agent: Offline Explorer
Disallow: /

User-agent: Teleport
Disallow: /

User-agent: TeleportPro
Disallow: /

User-agent: WebZIP
Disallow: /

User-agent: linko
Disallow: /

User-agent: HTTrack
Disallow: /

User-agent: Microsoft.URL.Control
Disallow: /

User-agent: Xenu
Disallow: /

User-agent: larbin
Disallow: /

User-agent: libwww
Disallow: /

User-agent: ZyBORG
Disallow: /

User-agent: Download Ninja
Disallow: /

User-agent: psbot
Disallow: /

User-agent: Python-urllib
Disallow: /

User-agent: lwp-trivial/1.34
Disallow: /

User-agent: lwp-trivial
Disallow: /

User-agent: lwp-request
Disallow: /

User-agent: Cazoodle
Disallow: /

#
# Sorry, wget in its recursive mode is a frequent problem.
# Please read the man page and use it properly; there is a
# --wait option you can use to set the delay between hits,
# for instance.
#
User-agent: wget
Disallow: /
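#
# A polite recursive fetch is possible without hammering the server. The
# command below is only an illustrative sketch (GNU wget assumed; the
# 2-second delay and rate limit are example values, not site policy):
#
#   wget --recursive --wait=2 --random-wait --limit-rate=50k http://www.4cheaters.de/
#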
#
# The 'grub' distributed client has been *very* poorly behaved.
#
User-agent: grub-client
Disallow: /

#
# Doesn't follow robots.txt anyway, but...
#
User-agent: k2spider
Disallow: /

#
# Hits many times per second, not acceptable
# http://www.nameprotect.com/botinfo.html
User-agent: NPBot
Disallow: /

# A capture bot, downloads gazillions of pages with no public benefit
# http://www.webreaper.net/
User-agent: WebReaper
Disallow: /

User-agent: SBIder
Disallow: /

# Don't know these, but don't want them...
User-agent: asterias
Disallow: /

User-agent: TECOMAC-Crawler
Disallow: /

User-agent: Custo
Disallow: /

User-agent: ichiro
Disallow: /

User-agent: Sensis Web Crawler
Disallow: /

User-agent: IRLbot
Disallow: /

# http://www.exabot.com/go/robot
User-agent: Exabot
Disallow: /

User-agent: CFNetwork
Disallow: /

User-agent: TencentTraveler
Disallow: /

# 2009-06-03
# http://gais.cs.ccu.edu.tw/robot.php
User-agent: Gaisbot
Disallow: /

# http://help.soso.com/webspider.htm
User-agent: Sosospider
Disallow: /

# http://lucene.apache.org/nutch/bot.html
User-agent: Nutch
Disallow: /

# http://CaretByte.com
User-agent: CaretByte
Disallow: /

# http://www.dontbuylists.com/
# http://www.dontbuylists.com/DBLBot_Explained.pdf
User-agent: DBLBot
Disallow: /

#
# Kalooga is an internet search engine for photo albums and image galleries
#
# http://www.kalooga.com/info.html?page=crawler
# Mozilla/5.0 (compatible; KaloogaBot; http://www.kalooga.com/info.html?page=crawler)
# User-agent: kalooga
# Allow: /

# Charlotte is a spider that is indexing the web for sites to include in its search engine index.
# http://www.searchme.com/support/spider/
User-agent: Charlotte
Disallow: /

# This is one of several experimental search engines produced by Thunderstone's R&D group whose mission is to advance our overall technology leadership.
# http://search.thunderstone.com/texis/websearch/about.html
User-agent: thunderstone
Disallow: /

#
# http://help.live.com/help.aspx?project=wl_webmasters
# User-agent: MSNbot
# Allow: /

# CatchBot investigates websites for publicly available information about companies, such as a company’s name, address, telephone number and keyword data about a company’s products and services.
# http://www.catchbot.com
User-agent: CatchBot
Disallow: /

#
# Startup based out of San Francisco called Topsy Labs. Building something cool.
#
# http://labs.topsy.com/butterfly.html
# User-agent: butterfly
# Allow: /

# http://www.sogou.com/docs/help/webmasters.htm#07
User-agent: sogou
Disallow: /

# Allow only specific directories
# http://www.scoutjet.com/
User-agent: ScoutJet
Disallow: /

# http://www.youdao.com/help/webmaster/robot/003/
User-agent: YodaoBot
Disallow: /

# http://help.naver.com/robots/
User-agent: NaverBot
Disallow: /

User-agent: Yeti
Disallow: /

# http://www.feedsky.com
User-agent: Feedsky
Disallow: /

# 2009-08-03
# http://www.bots-on-para.de/bot.html
User-agent: BotOnParade
Disallow: /

# Tagoobot/3.0; +http://www.tagoo.ru
User-agent: Tagoobot
Disallow: /

# 2009-12-02
# aiHitBot-DS/1.0; +http://www.aihit.com/
User-agent: aiHitBot
Disallow: /

# 2009-12-17
# Mozilla/5.0 (compatible; 008/0.83; http://www.80legs.com/spider.html;) Gecko/2008032620
User-agent: 008
Disallow: /

# 2010-01-04
# TurnitinBot - http://www.turnitin.com/robot/crawlerinfo.html
# TurnitinBot/2.1 (http://www.turnitin.com/robot/crawlerinfo.html)
User-agent: TurnitinBot
Disallow: /

# MLBot (www.metadatalabs.com/mlbot)
User-agent: MLBot
Disallow: /

# 2010-02-02
# Linguee Bot (http://www.linguee.com/bot; bot@linguee.com)
User-agent: Linguee
Disallow: /

# 2010-04-23
# BiWeC: Big Web Corpus http://nlp.fi.muni.cz/projekty/biwec/
User-agent: BiWeC
Disallow: /

# 2013-07-07
# BLEXBot/1.0; +http://webmeup.com/crawler.html
User-agent: BLEXBot
Disallow: /