File: //lib/ruby/vendor_ruby/webrobots/robotstxt.rb
#
# DO NOT MODIFY!!!!
# This file is automatically generated by Racc 1.4.9
# from Racc grammar file "".
#
require 'racc/parser.rb'
require 'strscan'
class WebRobots
# Base class for the errors defined in this file; ParseError derives from it
# and Parser#parse! rescues it.
class Error < StandardError; end
# Error raised by the parser; it remembers the site whose robots.txt was
# being parsed in addition to the message.
class ParseError < Error
  # The site's root URI
  attr_reader :site

  # message:: text returned by #to_s
  # site::    value exposed through #site
  def initialize(message, site)
    @site = site
    @message = message
  end

  # Returns the message given to the constructor.
  def to_s
    @message
  end
end
class RobotsTxt
class Parser < Racc::Parser
module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 171)
# Creates a parser.
#
# target::              stored in @target and handed to every RobotsTxt
#                       this parser builds
# crawl_delay_handler:: optional object stored in @crawl_delay_handler and
#                       likewise handed to RobotsTxt
def initialize(target, crawl_delay_handler = nil)
  super()
  @target, @crawl_delay_handler = target, crawl_delay_handler
end
# Like #parse, but never lets an Error escape: when parsing fails, an
# empty RobotsTxt (records = nil) carrying the exception in its :error
# option is returned instead.
def parse!(input, site)
  begin
    parse(input, site)
  rescue Error => failure
    fallback_options = {
      :error => failure,
      :target => @target,
      :crawl_delay_handler => @crawl_delay_handler
    }
    RobotsTxt.new(site, nil, fallback_options)
  end
end
# Field names the tokenizer reports as their own (downcased) token type
# instead of the generic :TOKEN. Frozen: RE_KNOWN_TOKENS is built from the
# list once, so mutating it later could only create an inconsistency.
KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap].freeze
# Case-insensitive, whole-string match against any entry of KNOWN_TOKENS.
RE_KNOWN_TOKENS = /\A(#{KNOWN_TOKENS.map { |t| Regexp.quote(t) }.join('|')})\z/i
# Tokenizes +input+ and runs the generated parser over the tokens.
#
# input:: a String, or any object responding to #read
# site::  stored in @site; used in error messages and by the reduce
#         actions
#
# The scanner appends [type, text] pairs to @q:
#   :EOL      optional blanks followed by LF, CRLF or end of input
#   :SPACE    a run of blanks/tabs
#   ":"       the field separator; the next non-blank text is a value
#   :COMMENT  "#" up to the end of the line
#   :VALUE    the text after ":" ('' when the line ends or a comment
#             starts right away)
#   known field names, downcased (see RE_KNOWN_TOKENS), or :TOKEN
#
# Returns the result of do_parse. A Racc::ParseError is re-raised as
# ParseError carrying @site. @q is emptied on every exit path.
def parse(input, site)
  @q ||= []
  @errors = []
  @lineno = 0
  @site = site

  string = input.respond_to?(:read) ? input.read : input
  s = StringScanner.new(string)
  value_expected = false

  until s.eos?
    @lineno += 1 if s.bol?
    if t = s.scan(/[ \t]*(?:\r?\n|\z)/)
      # The line ended before any value text: emit an empty value.
      if value_expected
        @q << [:VALUE, '']
      end
      @q << [:EOL, t]
      value_expected = false
    elsif t = s.scan(/[ \t]+/)
      @q << [:SPACE, t]
    elsif t = s.scan(/:/)
      @q << [t, t]
      value_expected = true
    elsif t = s.scan(/#.*/)
      if value_expected
        @q << [:VALUE, '']
      end
      @q << [:COMMENT, t]
    else
      if value_expected
        # A value stops before trailing blanks that are followed by a
        # comment, a line terminator or the end of input. The terminator
        # is spelled \r?\n|\z rather than $: `$` does not match in front
        # of "\r", so with CRLF line endings the CR used to be glued onto
        # the value instead of being left for the :EOL token.
        if t = s.scan(/.*?(?=[ \t]*(?:#|\r?\n|\z))/)
          @q << [:VALUE, t]
        else
          # parse_error takes the message only; it adds @lineno itself.
          parse_error "unexpected characters: %s" % s.check(/.*/)
        end
        value_expected = false
      elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
        case t
        when RE_KNOWN_TOKENS
          @q << [t.downcase, t]
        else
          @q << [:TOKEN, t]
        end
      else
        parse_error "unexpected characters: %s" % s.check(/.*/)
      end
    end
  end

  # Make sure the token stream ends with an :EOL even when the input does
  # not end with a newline.
  @q << [:EOL, ''] if !@q.empty? && @q.last.first != :EOL

  @pos = -1

  do_parse
rescue Racc::ParseError => e
  raise ParseError.new(e.message, @site)
ensure
  @q.clear
end
# Advances @pos and returns the token pair at the new position in @q
# (nil once the queue is exhausted).
def next_token
  @pos += 1
  @q[@pos]
end
# Reports an unexpected token through #parse_error, naming the token via
# token_to_str. The +stack+ argument is not used.
def on_error(token_id, value, stack)
  token_name = token_to_str(token_id)
  parse_error format("unexpected %s: %s", token_name, value)
end
# Prefixes +message+ with the site and the current @lineno, then either
# records it in @errors (when @lax is set) or raises Racc::ParseError.
def parse_error(message)
  message = format("%s line %d: %s", @site.to_s, @lineno, message)
  raise Racc::ParseError, message unless @lax
  @errors << message
end
...end robotstxt.ry/module_eval...
##### State transition tables begin ###
# Action tables emitted by Racc (see the file header). They are opaque
# generated data: nothing in this file reads them directly, they are only
# passed on through Racc_arg below. Do not edit by hand.
racc_action_table = [
5, 12, -10, 16, 52, 40, -12, 36, 37, 38,
39, 12, -10, 16, 46, 27, 27, 36, 37, 38,
39, 12, -10, 16, 49, 50, 51, 36, 37, 38,
39, 12, -10, 16, 12, 53, 24, 36, 37, 38,
39, 12, -10, 16, 12, 12, -12, 12, -10, 16,
60, 12, -13, 16, 60, 12, 12, 16, 60, 12,
12, 16, 60, 12, 12, 16, 60, 12, 23, 16,
60, 12, 62, 16, 63, 64, 65, 66, 5, 9,
5, 6, 5 ]
# Same length as racc_action_table; presumably the per-slot check values
# the Racc runtime uses to validate a lookup — confirm against Racc docs.
racc_action_check = [
21, 21, 21, 21, 39, 23, 21, 21, 21, 21,
21, 25, 25, 25, 27, 19, 25, 25, 25, 25,
25, 45, 45, 45, 36, 37, 38, 45, 45, 45,
45, 29, 29, 29, 24, 41, 16, 29, 29, 29,
29, 7, 7, 7, 46, 49, 7, 13, 13, 13,
62, 62, 13, 62, 53, 53, 50, 53, 63, 63,
51, 63, 64, 64, 52, 64, 65, 65, 15, 65,
66, 66, 54, 66, 55, 56, 57, 58, 11, 6,
3, 1, 0 ]
# 72 entries, matching racc_shift_n below.
racc_action_pointer = [
80, 81, nil, 78, nil, nil, 79, 38, nil, nil,
nil, 76, nil, 44, nil, 64, 30, nil, nil, 7,
nil, -2, nil, 3, 31, 8, nil, 8, nil, 28,
nil, nil, nil, nil, nil, nil, 18, 19, 20, -2,
nil, 28, nil, nil, nil, 18, 41, nil, nil, 42,
53, 57, 61, 52, 65, 67, 68, 69, 70, nil,
nil, nil, 48, 56, 60, 64, 68, nil, nil, nil,
nil, nil ]
# 72 entries, matching racc_shift_n below.
racc_action_default = [
-5, -44, -1, -6, -7, -9, -44, -3, -8, 72,
-2, -5, -11, -23, -14, -44, -44, -18, -19, -44,
-4, -6, -15, -44, -10, -29, -25, -44, -20, -21,
-22, -31, -34, -35, -36, -37, -44, -44, -44, -44,
-16, -44, -24, -26, -27, -30, -10, -32, -33, -10,
-10, -10, -10, -10, -44, -44, -44, -44, -44, -17,
-42, -43, -10, -10, -10, -10, -10, -28, -38, -39,
-40, -41 ]
# Goto tables emitted by Racc. Like the action tables they are opaque
# generated data that is only handed to the runtime through Racc_arg.
# Do not edit by hand.
racc_goto_table = [
14, 41, 8, 47, 3, 2, 22, 17, 29, 11,
18, 26, 45, 10, 14, 21, 20, 43, 44, 47,
8, 28, 48, 54, 30, 25, 55, 56, 57, 58,
59, 42, 7, 1, nil, nil, nil, nil, 48, 67,
68, 69, 70, 71 ]
# Same length as racc_goto_table.
racc_goto_check = [
11, 8, 7, 19, 6, 2, 11, 13, 15, 5,
14, 18, 15, 3, 11, 6, 2, 18, 11, 19,
7, 13, 11, 8, 14, 16, 8, 8, 8, 8,
12, 17, 4, 1, nil, nil, nil, nil, 11, 12,
12, 12, 12, 12 ]
# 24 entries — the same count as the nonterminal names listed from index
# racc_nt_base onwards in Racc_token_to_s_table.
racc_goto_pointer = [
nil, 33, 5, 6, 30, 2, 4, -1, -23, nil,
nil, -7, -23, 0, 3, -13, 6, 6, -8, -26,
nil, nil, nil, nil ]
# 24 entries, parallel to racc_goto_pointer.
racc_goto_default = [
nil, nil, nil, nil, nil, nil, nil, 4, 15, 19,
13, 61, nil, nil, nil, nil, nil, nil, nil, 31,
32, 33, 34, 35 ]
# One triple per reduction, 44 in total (racc_reduce_n). The third element
# names the method invoked for that reduction: a _reduce_N method defined
# further down, or _reduce_none when the rule has no action of its own.
# NOTE(review): the first two numbers look like the number of values on the
# rule's right-hand side and the id of the nonterminal produced (ids as in
# Racc_token_to_s_table) — e.g. the 5-value rules use val[0] and val[3] —
# confirm against the Racc documentation before relying on it.
racc_reduce_table = [
0, 0, :racc_error,
0, 17, :_reduce_1,
3, 14, :_reduce_2,
0, 16, :_reduce_none,
2, 16, :_reduce_none,
0, 15, :_reduce_none,
1, 15, :_reduce_none,
1, 19, :_reduce_none,
2, 19, :_reduce_none,
1, 20, :_reduce_none,
0, 21, :_reduce_none,
1, 21, :_reduce_none,
0, 22, :_reduce_none,
1, 22, :_reduce_none,
1, 23, :_reduce_none,
2, 23, :_reduce_none,
3, 24, :_reduce_none,
5, 24, :_reduce_17,
1, 18, :_reduce_18,
1, 18, :_reduce_19,
3, 18, :_reduce_20,
3, 18, :_reduce_21,
3, 18, :_reduce_none,
1, 27, :_reduce_none,
3, 26, :_reduce_24,
1, 29, :_reduce_25,
2, 29, :_reduce_26,
2, 29, :_reduce_none,
5, 31, :_reduce_28,
0, 30, :_reduce_none,
1, 30, :_reduce_none,
1, 28, :_reduce_31,
2, 28, :_reduce_32,
2, 28, :_reduce_none,
1, 32, :_reduce_none,
1, 32, :_reduce_none,
1, 32, :_reduce_none,
1, 32, :_reduce_none,
5, 33, :_reduce_38,
5, 34, :_reduce_39,
5, 35, :_reduce_40,
5, 36, :_reduce_41,
1, 25, :_reduce_none,
1, 25, :_reduce_none ]
# Number of triples in racc_reduce_table.
racc_reduce_n = 44
# Equals the number of entries in racc_action_pointer/racc_action_default.
racc_shift_n = 72
# Maps the token types pushed onto @q by Parser#parse (symbols such as
# :EOL and :VALUE, the ":" separator and the downcased known field names)
# to the integer ids used by the tables.
racc_token_table = {
false => 0,
:error => 1,
:EOL => 2,
:SPACE => 3,
:COMMENT => 4,
"sitemap" => 5,
":" => 6,
:VALUE => 7,
"user-agent" => 8,
"allow" => 9,
"disallow" => 10,
"crawl-delay" => 11,
:TOKEN => 12 }
# One past the highest token id above; names from this index onwards in
# Racc_token_to_s_table are grammar nonterminals.
racc_nt_base = 13
# NOTE(review): presumably tells the runtime that reduce methods take and
# return a +result+ argument, as every _reduce_N below does — confirm.
racc_use_result_var = true
# Bundle of everything above in a fixed positional order. This constant is
# not read anywhere in this file; presumably Racc::Parser picks it up, so
# the order must not be changed.
Racc_arg = [
racc_action_table,
racc_action_check,
racc_action_default,
racc_action_pointer,
racc_goto_table,
racc_goto_check,
racc_goto_default,
racc_goto_pointer,
racc_nt_base,
racc_reduce_table,
racc_token_table,
racc_shift_n,
racc_reduce_n,
racc_use_result_var ]
# Display names indexed by symbol id. Indexes 0-12 line up with the ids in
# racc_token_table (terminals); indexes from racc_nt_base (13) onwards are
# the grammar's nonterminals. #on_error obtains token names via
# token_to_str, which presumably consults this table.
Racc_token_to_s_table = [
"$end",
"error",
"EOL",
"SPACE",
"COMMENT",
"\"sitemap\"",
"\":\"",
"VALUE",
"\"user-agent\"",
"\"allow\"",
"\"disallow\"",
"\"crawl-delay\"",
"TOKEN",
"$start",
"robotstxt",
"opt_blanklines",
"body",
"@1",
"records",
"blanklines",
"blankline",
"opt_space",
"opt_commentlines",
"commentlines",
"comment",
"eol_opt_comment",
"record",
"commentblock",
"rulelines",
"agentlines",
"opt_rulelines",
"agentline",
"ruleline",
"allowline",
"disallowline",
"crawldelayline",
"extension" ]
# NOTE(review): presumably switches on the Racc runtime's debug tracing
# when true — confirm against the Racc documentation.
Racc_debug_parser = false
##### State transition tables end #####
# reduce 0 omitted
module_eval(<<'.,.,', 'robotstxt.ry', 7)
# Starts a fresh @sitemaps list; +result+ is passed through untouched.
def _reduce_1(val, _values, result)
  @sitemaps = Array.new
  return result
end
.,.,
module_eval(<<'.,.,', 'robotstxt.ry', 11)
# Builds the final RobotsTxt from the third value (the body) together with
# the parser's target, collected sitemaps and crawl-delay handler.
def _reduce_2(val, _values, result)
  robots_options = {
    :target => @target,
    :sitemaps => @sitemaps,
    :crawl_delay_handler => @crawl_delay_handler
  }
  result = RobotsTxt.new(@site, val[2], robots_options)
  result
end
.,.,
# reduce 3 omitted
# reduce 4 omitted
# reduce 5 omitted
# reduce 6 omitted
# reduce 7 omitted
# reduce 8 omitted
# reduce 9 omitted
# reduce 10 omitted
# reduce 11 omitted
# reduce 12 omitted
# reduce 13 omitted
# reduce 14 omitted
# reduce 15 omitted
# reduce 16 omitted
module_eval(<<'.,.,', 'robotstxt.ry', 44)
# Records the fourth value of the rule in @sitemaps; +result+ is returned
# unchanged.
def _reduce_17(val, _values, result)
  @sitemaps.push(val[3])
  result
end
.,.,
module_eval(<<'.,.,', 'robotstxt.ry', 49)
# Starts a new list containing only the first value.
def _reduce_18(val, _values, result)
  [val[0]]
end
.,.,
module_eval(<<'.,.,', 'robotstxt.ry', 54)
# Yields a new empty list, discarding the incoming +result+.
def _reduce_19(val, _values, result)
  []
end
.,.,
module_eval(<<'.,.,', 'robotstxt.ry', 60)
# Appends the third value to the list accumulated in +result+ (in place)
# and returns that list.
def _reduce_20(val, _values, result)
  result.push(val[2])
end
.,.,
module_eval(<<'.,.,', 'robotstxt.ry', 66)
# Warns (only when $VERBOSE is set) about each rule line in the third
# value, using the line numbers collected in @rulelinenos, and returns
# +result+ unchanged — the lines themselves are not added to it.
def _reduce_21(val, _values, result)
  val[2].each_with_index do |line, index|
    next unless $VERBOSE
    warn format("%s line %d: %s: orphan rule line",
                @site.to_s, @rulelinenos[index], line.token)
  end
  result
end
.,.,
# reduce 22 omitted
# reduce 23 omitted
module_eval(<<'.,.,', 'robotstxt.ry', 81)
# Combines the second and third values into a Record.
def _reduce_24(val, _values, result)
  agentlines, rulelines = val[1], val[2]
  Record.new(agentlines, rulelines)
end
.,.,
module_eval(<<'.,.,', 'robotstxt.ry', 86)
# Wraps the first value in a new one-element list.
def _reduce_25(val, _values, result)
  Array[val[0]]
end
.,.,
module_eval(<<'.,.,', 'robotstxt.ry', 91)
# Appends the second value to the list in +result+ (in place) and returns
# that list.
def _reduce_26(val, _values, result)
  result.push(val[1])
end
.,.,
# reduce 27 omitted
module_eval(<<'.,.,', 'robotstxt.ry', 98)
# Builds an AgentLine from the first value (the field token) and the
# fourth value (the field's value).
def _reduce_28(val, _values, result)
  token, _colon, _space, value = val
  AgentLine.new(token, value)
end
.,.,
# reduce 29 omitted
# reduce 30 omitted
module_eval(<<'.,.,', 'robotstxt.ry', 106)
# Starts a rule-line list holding the incoming +result+ and resets the
# line-number bookkeeping in @rulelinenos.
def _reduce_31(val, _values, result)
  @rulelinenos = []
  [result]
end
.,.,
module_eval(<<'.,.,', 'robotstxt.ry', 112)
# Appends the second value to the rule-line list in +result+ and notes the
# current @lineno in @rulelinenos; returns the list.
def _reduce_32(val, _values, result)
  result.push(val[1])
  @rulelinenos.push(@lineno)
  result
end
.,.,
# reduce 33 omitted
# reduce 34 omitted
# reduce 35 omitted
# reduce 36 omitted
# reduce 37 omitted
module_eval(<<'.,.,', 'robotstxt.ry', 125)
# Builds an AllowLine from the first value (the field token) and the
# fourth value (the field's value).
def _reduce_38(val, _values, result)
  token, _colon, _space, value = val
  AllowLine.new(token, value)
end
.,.,
module_eval(<<'.,.,', 'robotstxt.ry', 130)
# Builds a DisallowLine from the first value (the field token) and the
# fourth value (the field's value).
def _reduce_39(val, _values, result)
  token, _colon, _space, value = val
  DisallowLine.new(token, value)
end
.,.,
module_eval(<<'.,.,', 'robotstxt.ry', 135)
# Builds a CrawlDelayLine from the first value (the field token) and the
# fourth value (the field's value).
def _reduce_40(val, _values, result)
  token, _colon, _space, value = val
  CrawlDelayLine.new(token, value)
end
.,.,
module_eval(<<'.,.,', 'robotstxt.ry', 140)
# Builds an ExtentionLine from the first value (the field token) and the
# fourth value (the field's value).
def _reduce_41(val, _values, result)
  token, _colon, _space, value = val
  ExtentionLine.new(token, value)
end
.,.,
# reduce 42 omitted
# reduce 43 omitted
# Default action for rules without their own: passes the first value on.
def _reduce_none(val, _values, result)
  val.first
end
end # class Parser
# Builds a robots.txt model for +site+.
#
# site::    exposed through #site
# records:: list of record objects (or nil); each must answer #default?
#           and #match?
# options:: optional hash; :error, :target, :sitemaps and
#           :crawl_delay_handler are read from it
#
# Records that match the target (all non-default records when there is no
# target) are kept in their original order; default records are always
# kept and placed after them.
def initialize(site, records, options = nil)
  @timestamp = Time.now
  @site = site
  @options = options || {}
  @last_checked_at = nil
  @error = @options[:error]
  @target = @options[:target]
  @sitemaps = @options[:sitemaps] || []
  @crawl_delay_handler = @options[:crawl_delay_handler]
  @records = []
  return unless records && !records.empty?

  defaults, specific = records.partition { |record| record.default? }
  @records = specific.select { |record| !@target || record.match?(@target) }
  @records.concat(defaults)
end
# Creation time, site and sitemap list as set in #initialize.
attr_reader :timestamp, :site, :sitemaps
# The error attached to this instance: taken from options[:error] at
# construction or assigned later; #error! raises it.
attr_accessor :error
# Raises the stored error, if any; otherwise does nothing and returns nil.
def error!
  return unless @error
  raise @error
end
# Resolves the user agent to check against.
#
# A targeted instance (one with @target) must be called without an
# argument and answers @target; an untargeted instance requires the
# argument and answers it. Any other combination raises ArgumentError.
def target(user_agent = nil)
  unless user_agent
    return @target if @target
    raise ArgumentError, "user_agent is mandatory for an untargeted instance"
  end
  raise ArgumentError, "this instance is targeted for #{@target}" if @target
  user_agent
end
private :target
# Returns the first record in @records that matches the user agent
# resolved by #target, or nil when none does.
def find_record(user_agent = nil)
  agent = target(user_agent)
  @records.find do |candidate|
    candidate.match?(agent)
  end
end
private :find_record
# Tells whether +request_uri+ may be fetched by the given (or targeted)
# user agent.
#
# Returns true right away when no record applies. Otherwise the record's
# verdict is returned; on the way, the crawl-delay handler (if any) is
# called with the record's delay and the time of the previous check, and
# the time of this check is remembered.
def allow?(request_uri, user_agent = nil)
  record = find_record(user_agent)
  return true unless record

  permitted = record.allow?(request_uri)
  delay = record.delay
  @crawl_delay_handler.call(delay, @last_checked_at) if delay && @crawl_delay_handler
  @last_checked_at = Time.now
  permitted
end
# Returns the delay of the record applying to the given (or targeted)
# user agent, or 0 when there is no such record or it has no delay.
def crawl_delay(user_agent = nil)
  record = find_record(user_agent)
  (record && record.delay) || 0
end
# Returns the options hash of the record applying to the given (or
# targeted) user agent, or an empty hash when no record applies.
def options(user_agent = nil)
  record = find_record(user_agent)
  record ? record.options : {}
end
# robots.txt body that shuts every user agent out of the whole site; fed
# to the parser by .unfetchable.
DISALLOW_ALL = "User-Agent: *\nDisallow: /\n"
# Builds the stand-in used when a site's robots.txt could not be fetched:
# DISALLOW_ALL is parsed for +site+ (and +target+) and +reason+ is stored
# as the resulting object's error.
def self.unfetchable(site, reason, target = nil)
  robots_txt = Parser.new(target).parse(DISALLOW_ALL, site)
  robots_txt.error = reason
  robots_txt
end
# One robots.txt record: the user-agent patterns it applies to plus its
# access rules, crawl delay and extension options.
class Record
  attr_reader :delay, :options

  # agentlines:: objects answering #pattern
  # rulelines::  rule line objects, or nil. AccessControlLine instances
  #              become access rules, a CrawlDelayLine sets the delay (a
  #              later one replaces an earlier one) and anything else is
  #              stored in #options under its downcased token.
  def initialize(agentlines, rulelines)
    @patterns = agentlines.map(&:pattern)
    @acls = []
    @delay = nil
    @options = {}
    (rulelines || []).each do |ruleline|
      case ruleline
      when AccessControlLine then @acls << ruleline
      when CrawlDelayLine then @delay = ruleline.delay
      else @options[ruleline.token.downcase] = ruleline.value
      end
    end
    # Longest value first; for equal lengths Allow lines sort ahead of
    # the others, so #allow? can stop at the first match.
    @acls = @acls.sort_by do |acl|
      [-acl.value.length, acl.is_a?(AllowLine) ? -1 : 0]
    end
  end

  # True when any of the record's patterns matches +user_agent+.
  def match?(user_agent)
    @patterns.any? { |candidate| candidate.match(user_agent) }
  end

  # True when the record holds the empty pattern.
  def default?
    @patterns.include?(//)
  end

  # Verdict of the first access rule matching +request_uri+; true when no
  # rule matches.
  def allow?(request_uri)
    rule = @acls.find { |acl| acl.match?(request_uri) }
    rule ? rule.allow? : true
  end
end
# A "token: value" line of robots.txt. Subclasses override #compile to
# derive extra state from the value.
class Line
  attr_reader :token, :value

  # Stores +token+ and +value+, then runs #compile.
  def initialize(token, value)
    @token, @value = token, value
    compile
  end

  # Hook run at the end of construction; the base version does nothing.
  # Returns self.
  def compile
    self
  end
end
# A User-agent line. Its pattern is the empty regexp for "*", otherwise a
# case-insensitive regexp matching the value literally.
class AgentLine < Line
  attr_reader :pattern

  # Derives @pattern from @value. Returns self.
  def compile
    @pattern =
      if @value == '*'
        //
      else
        Regexp.new(Regexp.quote(@value), Regexp::IGNORECASE)
      end
    self
  end
end
# Common behaviour of Allow and Disallow lines: the value is turned into
# a regexp anchored at the start of the request URI.
class AccessControlLine < Line
  # Translates @value into @pattern:
  # * "*" becomes ".*"
  # * "$" anchors the pattern at the end; whatever follows it is ignored
  # * "%2F" (any case) only ever matches an encoded slash
  # * any other %XX escape is replaced by the character it encodes
  # * everything else is matched literally
  # Returns self.
  def compile
    @empty = @value.empty?
    scanner = StringScanner.new(@value)
    source = '\A'
    until scanner.eos?
      if (literal = scanner.scan(/[^%*$]+/))
        source << Regexp.quote(literal)
      elsif scanner.scan(/%([0-9a-f]{2})/i)
        code = scanner[1].to_i(16)
        source << (code == 0x2f ? '%2[fF]' : Regexp.quote('%c' % code))
      elsif scanner.scan(/\*/)
        source << '.*'
      elsif scanner.scan(/\$/)
        source << '\z'
        break
      else
        # A "%" that does not start a valid escape: take it literally.
        source << Regexp.quote(scanner.scan(/./))
      end
    end
    @pattern = Regexp.new(source, Regexp::MULTILINE)
    self
  end

  # True when the pattern matches +request_uri+ after its %XX escapes,
  # except encoded slashes, have been decoded. A line with an empty value
  # never matches.
  def match?(request_uri)
    return false if @empty
    decoded = request_uri.gsub(/(%2[fF])|%([0-9a-f]{2})/i) { $1 || '%c' % $2.to_i(16) }
    !@pattern.match(decoded).nil?
  end
end
# An Allow line: a matching request URI is permitted.
class AllowLine < AccessControlLine
  # Always true.
  def allow?
    return true
  end
end
# A Disallow line: a matching request URI is refused.
class DisallowLine < AccessControlLine
  # Always false.
  def allow?
    return false
  end
end
# A Crawl-delay line. The delay is a Float when the value starts with a
# decimal number, an Integer when it starts with a whole number, and nil
# otherwise.
class CrawlDelayLine < Line
  attr_reader :delay

  # Derives @delay from @value. Returns self.
  def compile
    @delay =
      case @value
      when /\A((0|[1-9][0-9]*)\.[0-9]+)/ then @value.to_f
      when /\A(0|[1-9][0-9]*)/ then @value.to_i
      end
    self
  end
end
# Any other "token: value" rule line; adds nothing to Line.
class ExtentionLine < Line; end
end
end