File: //usr/share/doc/w3m/examples/Bonus/html2latex
#!/usr/bin/ruby
#
#       HTML to LaTeX converter
#         by A. Ito, 16 June, 1997
#
require 'kconv'
# configuration
# Convert an image to EPS by running ImageMagick's `convert`.
#
# giffile:: path of the source image
# epsfile:: path of the EPS file to create
#
# The command line is echoed on standard error.  Returns the result of
# Kernel#system.
def gif2eps(giffile,epsfile)
  # Hand the words to system separately so the file names never pass
  # through a shell: they come from an HTML SRC attribute and may contain
  # spaces or shell metacharacters.
  cmd = ["convert", giffile.to_s, epsfile.to_s]
  STDERR.print cmd.join(" "),"\n"
  system(*cmd)
end
###########################################################################
# A parsed HTML tag such as <IMG SRC="a.gif" ALT="some text" ISMAP>.
#
# The tag name and the attribute names are folded to lower case.  An
# attribute written without a value is stored as true.
class Tag
  # One attribute: a name, optionally followed by "=" and a value that is
  # double-quoted, single-quoted or a bare word.  Matching the quotes here
  # keeps values that contain blanks in one piece.
  ATTRIBUTE = /([^\s=]+)(?:\s*=\s*("[^"]*"|'[^']*'|\S*))?/

  # str:: the tag text, normally including the surrounding "<" and ">"
  def initialize(str)
    str = $1 if str =~ /<(.+)>/
    name, rest = str.strip.split(/\s+/, 2)
    name = name.to_s                       # "< >" yields no name at all
    name = name.chomp("/") if name.size > 1  # XHTML style <br/>
    @tagname = name.downcase
    @vals = {}
    rest.to_s.scan(ATTRIBUTE) do |key, value|
      @vals[key.downcase] = value.nil? ? true : unquote(value)
    end
  end

  # The lower-cased tag name; closing tags keep their leading "/".
  def tagname
    @tagname
  end

  # Yields every attribute as a (name, value) pair.
  def each
    @vals.each { |k, v| yield k, v }
  end

  # Value of attribute +k+ (lower case): a String, true for a valueless
  # attribute, or nil when the attribute is absent.
  def switch(k)
    @vals[k]
  end

  # Remove the quotes around an attribute value.
  def unquote(value)
    return $1 if value =~ /\A'(.*)'\z/m
    value.sub(/\A"/, "").sub(/"\z/, "")
  end
  private :unquote
end
# Splits an HTML file into a stream of tokens: Tag objects for markup,
# LaTeX fragments for character entities and plain Strings for text.
# Input lines are converted to EUC with Kconv as they are read.
class TokenStream
  TAG_START = ?<
  TAG_END = ?>
  AMP_START = ?&
  AMP_END = ?;

  # LaTeX text for the character entities the converter understands.
  # Every token is later passed through Environment#flush, which rewrites
  # "$" and "~", so the replacements are spelled without those characters:
  # \(...\) instead of $...$ and \nobreak\  instead of ~.
  AMP_REPLACE_TABLE = {
    '&amp;'  => '\\&',
    '&gt;'   => '\\(>\\)',
    '&lt;'   => '\\(<\\)',
    '&nbsp;' => '\\nobreak\\ ',
    '&quot;' => '"',
  }

  # A character entity at the start of a string: &name; or &#123;
  ENTITY = /\A&#?[A-Za-z0-9]+;/

  # file:: an open input stream (anything with a public +gets+) or the
  #        name of the file to read
  def initialize(file)
    @f = file.respond_to?(:gets) ? file : File.new(file)
    @buf = nil
    @bpos = 0
  end

  # Collects characters from the current position up to and including
  # +endsym+, reading further lines as needed.  Line ends inside the span
  # become blanks.  At end of input the incomplete span is returned.
  def read_until(endsym)
    tag = []
    loop do
      if @buf.nil? then
        return tag.join('') unless fill
      end
      while @bpos < @buf.size
        c = @buf[@bpos]
        @bpos += 1
        if c == endsym then
          tag.push(c.chr)
          return tag.join('')
        end
        # ?\n and ?\r compare correctly whether indexing a String yields
        # a character code or a one-character String.
        tag.push((c == ?\n || c == ?\r) ? ' ' : c.chr)
      end
      @buf = nil
    end
  end

  # Returns the next token: a Tag, the LaTeX text of a character entity,
  # or a run of plain text.  Runs consisting only of white space are
  # skipped.  Returns nil when the input is exhausted.
  def get
    loop do
      if @buf.nil? || @bpos >= @buf.size then
        return nil unless fill
      end
      c = @buf[@bpos]
      return Tag.new(read_until(TAG_END)) if c == TAG_START
      if c == AMP_START then
        if @buf[@bpos..-1] =~ ENTITY then
          entity = $&
          @bpos += entity.size
          return replace_amp(entity)
        end
        # A bare "&" (as in "AT&T") is an ordinary character; do not
        # swallow the text up to the next ";".
        @bpos += 1
        return AMP_REPLACE_TABLE['&amp;']
      end
      start = @bpos
      while @bpos < @buf.size &&
          @buf[@bpos] != TAG_START && @buf[@bpos] != AMP_START
        @bpos += 1
      end
      text = @buf[start, @bpos - start]
      return text unless text =~ /^\s+$/
    end
  end

  # True once the buffered line has been consumed and the underlying
  # stream has nothing left.
  def eof?
    (@buf.nil? || @bpos >= @buf.size) && @f.eof?
  end

  # LaTeX replacement for the entity +s+; unknown entities are returned
  # unchanged.
  def replace_amp(s)
    AMP_REPLACE_TABLE.fetch(s, s)
  end

  # Loads the next input line into the buffer.  Returns false at end of
  # input, leaving the buffer empty.
  def fill
    line = @f.gets
    @bpos = 0
    @buf = line.nil? ? nil : Kconv.toeuc(line)
    !@buf.nil?
  end
  private :fill
end
# Write the LaTeX preamble to standard output: the document style line
# plus the \hr, \pre and \gt helper definitions used by the generated
# markup.  The output starts with an empty line.
def print_header
  preamble = <<'PREAMBLE'

\documentstyle[epsf]{jarticle}
\def\hr{\par\hbox to \textwidth{\hrulefill}}
\def\pre{\begin{quote}\def\baselinestretch{0.8}\tt\obeylines}
\def\endpre{\end{quote}}
\makeatletter
\@ifundefined{gt}{\let\gt=\dg}{}
\makeatother
PREAMBLE
  print preamble
end
# A stack of environments.  Tag lookups start at the innermost (last
# pushed) environment and fall back to the enclosing ones.
class Environ_stack
  # envs:: the initial environments, outermost first
  def initialize(*envs)
    @stack = envs
  end

  # Looks +tag+ up in every environment, innermost first, and returns the
  # first non-nil answer, or nil when no environment knows the tag.
  # Names starting with "!" are comments and always map to an empty
  # action.
  def action(tag)
    return ["",nil] if tag =~ /^!/
    @stack.reverse_each do |env|
      found = env.action(tag)
      return found unless found.nil?
    end
    nil
  end

  # Removes and returns the innermost environment.
  def pop
    @stack.pop
  end

  # Makes +env+ the innermost environment.
  def push(env)
    @stack.push(env)
  end

  # The innermost environment, or nil when the stack is empty.
  def top
    @stack.last
  end

  # Pushes a clone of the innermost environment.  Note that this is not
  # Object#dup: the stack itself is modified.
  def dup
    copy = top.clone
    @stack.push(copy)
  end
end
# One level of conversion state: the tag table in force, whether output
# is suppressed, the paragraph alignment and the table being collected.
class Environment
  # LaTeX spellings of the characters that flush protects.  "&" is
  # handled separately (see escape).
  LATEX_ESCAPES = {
    '%' => '\\%',
    '#' => '\\#',
    '$' => '\\$',
    '_' => '\\verb+_+',
    '^' => '\\verb+^+',
    '~' => '\\verb+~+',
  }

  # interp:: Hash mapping tag names to actions
  def initialize(interp)
    @silent = false
    @in_table = false
    @interp = interp
    @align = nil
  end

  # The action registered for +tag+, or nil.
  def action(tag)
    @interp[tag]
  end

  # Outputs +tok+.  Strings get their LaTeX special characters escaped.
  # Inside a table the text is added to the current cell; otherwise it is
  # printed unless output is silenced.  In an aligned paragraph a string
  # ending in a newline is terminated with "\\".
  def flush(tok)
    tok = escape(tok) if tok.kind_of?(String)
    if @in_table then
      append_to_cell(tok)
    elsif !@silent then
      if !@align.nil? && tok.kind_of?(String) && tok =~ /\n$/ then
        print tok.chop,"\\\\\n"
      else
        print tok
      end
    end
  end

  # Replaces the tag table.
  def set_interp(interp)
    @interp = interp
  end

  # tag processing methods

  # <TITLE>
  def do_silent(tag)
    @silent = true
  end

  # </TITLE>
  def undo_silent(tag)
    @silent = false
  end

  # <IMG>: converts the image to EPS and includes the result.  Tags
  # without a SRC value are skipped.
  def img_proc(tag)
    src = tag.switch('src')
    return unless src.kind_of?(String)
    # Replace a trailing .gif, otherwise append, so the EPS file never has
    # the same name as the source image.
    newfile = src.sub(/\.gif\z/i,"") + ".eps"
    gif2eps(src,newfile)
    # The file name is an argument, not text: do not escape "_" in it.
    emit "\\epsfile{file=#{newfile}}\n"
  end

  # <TABLE>
  def starttable(tag)
    @table = []
    @tablespan = []
    @table_rows = -1
    @table_cols = -1
    @table_cols_max = 0
    @in_table = true
    @table_border = !tag.switch('border').nil?
  end

  # <TR>
  def start_row(tag)
    @table_rows += 1
    @table[@table_rows] = []
    @tablespan[@table_rows] = []
    @table_cols = -1
    @colspan = 1
  end

  # <TD>: opens a cell.  The span is recorded at the cell's first column,
  # the text is collected at its last column.
  def start_col(tag)
    start_row(tag) if @table_rows < 0     # <TD> without a preceding <TR>
    span = tag.switch('colspan')
    span = span.respond_to?(:to_i) ? span.to_i : 1
    span = 1 if span < 1                  # a span of 0 would never advance
    @colspan = span
    @tablespan[@table_rows][@table_cols+1] = span
    @table_cols += span
    @table[@table_rows][@table_cols] = ""
    @table_cols_max = @table_cols if @table_cols > @table_cols_max
  end

  # </TABLE>: writes the collected rows as a tabular environment.
  def endtable(tag)
    @in_table = false
    return if @table.nil?                 # </TABLE> without <TABLE>
    colspec = @table_border ? "|l" : "l"
    edge = @table_border ? "|" : ""
    emit "\\begin{tabular}{*{#{@table_cols_max+1}}{#{colspec}}#{edge}}\n"
    emit "\\hline\n" if @table_border
    (0..@table_rows).each do |i|
      # Cell text was escaped when it was collected; emit it untouched.
      emit format_row(@table[i], @tablespan[i])
      emit "\\\\\n"
      emit "\\hline\n" if @table_border
    end
    emit "\\end{tabular}\n"
  end

  # <CENTER>
  def startcenter(tag)
    if @in_table then
      emit "\\hfil"
    else
      emit "\\begin{center}\n"
    end
  end

  # </CENTER>
  def endcenter(tag)
    if @in_table then
      emit "\\hfil"
    else
      emit "\\end{center}\n"
    end
  end

  # <P>: a plain paragraph break, or the start of an aligned environment
  # when ALIGN is left, center or right.  Any other ALIGN value is treated
  # as no alignment.
  def paragraph(tag)
    align = tag.switch('align')
    align = align.kind_of?(String) ? align.downcase : nil
    case align
    when "left" then
      emit "\\begin{flushleft}\n"
      @endparagraph = "\\end{flushleft}\n"
    when "center" then
      emit "\\begin{center}\n"
      @endparagraph = "\\end{center}\n"
    when "right" then
      emit "\\begin{flushright}\n"
      @endparagraph = "\\end{flushright}\n"
    else
      align = nil
      emit "\\par\n"
      @endparagraph = ""
    end
    @align = align
  end

  # </P>
  def endparagraph(tag)
    unless @align.nil? then
      @align = nil
      emit @endparagraph
    end
  end

  private

  # Escapes the LaTeX special characters in +text+.  An ampersand that is
  # already preceded by a backslash is left as it is, so entity
  # replacements such as "\&" survive.  The block form of gsub is used
  # because "\&" in a replacement string stands for the matched text.
  def escape(text)
    text.gsub(/\\?&|[$%_^~#]/) { |m| LATEX_ESCAPES[m] || '\\&' }
  end

  # Outputs finished LaTeX markup without escaping it.
  def emit(markup)
    if @in_table then
      append_to_cell(markup)
    elsif !@silent then
      print markup
    end
  end

  # Adds +tok+ to the cell being collected.  Text outside any cell is
  # dropped.
  def append_to_cell(tok)
    return if @table_rows < 0 || @table_cols < 0
    @table[@table_rows][@table_cols] += tok.to_s
  end

  # LaTeX for one table row, using the row's own column count.
  def format_row(cells, spans)
    last = cells.size - 1
    parts = []
    j = 0
    while j <= last
      span = spans[j] || 1
      text = cells[j+span-1].to_s
      if span == 1 then
        parts.push(text)
      else
        form = "l"
        form = (j+span > last ? "|l|" : "|l") if @table_border
        parts.push("\\multicolumn{#{span}}{#{form}}{#{text}}")
      end
      j += span
    end
    parts.join("&")
  end
end
# Tag tables.  Each entry maps a lower-case tag name to
#   [action, environment change, needs document body]
# action::             a String of LaTeX markup to output, or the Symbol
#                      of the Environment method that handles the tag
# environment change:: a Hash (tag table of the environment to open),
#                      "pop" (close the innermost environment) or nil
# needs document body:: when true, \begin{document} is written first if
#                      the document has not been opened yet
#
# Markup strings are written through Environment#flush, which escapes
# "$", "_", "^" and a few other characters, so they must not contain any
# of those.
enum_interp = {
  'li' => ["\\item ",nil]
}
item_interp = {
  'li' => ["\\item ",nil]
}
desc_interp = {
  'dt' => ["\\item[",nil],
  'dd' => ["]\n",nil]
}
table_interp = {
  'tr' => [:start_row,nil],
  'td' => [:start_col,nil],
  'th' => [:start_col,nil],
  '/tr' => ["",nil],
  '/td' => ["",nil],
  '/th' => ["",nil],
}
para_interp = {
  '/p'      => [:endparagraph ,"pop",true],
}
main_interp = {
  'body'    => ["\\begin{document}\n",nil,false],
  '/body'   => ["\\end{document}\n",nil,false],
  'head'    => ["",nil,false],
  '/head'   => ["",nil,false],
  'html'    => ["",nil,false],
  '/html'   => ["",nil,false],
  'title'   => [:do_silent,nil,false],
  '/title'  => [:undo_silent,nil,false],
  '!'       => ["",nil,false],
  'h1'      => ["\\section{",nil,true],
  'h2'      => ["\\subsection{",nil,true],
  'h3'      => ["\\subsubsection{",nil,true],
  'h4'      => ["\\paragraph{",nil,true],
  '/h1'     => ["}\n",nil,true],
  '/h2'     => ["}\n",nil,true],
  '/h3'     => ["}\n",nil,true],
  '/h4'     => ["}\n",nil,true],
  'a'       => ["",nil,true],
  '/a'      => ["",nil,true],
  'center'  => [:startcenter,nil,true],
  '/center' => [:endcenter,nil,true],
  'ol'      => ["\\begin{enumerate}\n",enum_interp,true],
  '/ol'     => ["\\end{enumerate}\n","pop",true],
  'ul'      => ["\\begin{itemize}\n",item_interp,true],
  '/ul'     => ["\\end{itemize}\n","pop",true],
  'dl'      => ["\\begin{description}\n",desc_interp,true],
  '/dl'     => ["\\end{description}\n","pop",true],
  'pre'     => ["\\begin{pre}\n",nil,true],
  '/pre'    => ["\\end{pre}\n",nil,true],
  'p'       => [:paragraph ,para_interp,true],
  'br'      => ["\\par ",nil,true],
  'img'     => [:img_proc,nil,true],
  'hr'      => ["\\hr ",nil,true],
  'b'       => ["{\\bf\\gt ",nil,true],
  '/b'      => ["}",nil,true],
  'strong'  => ["{\\bf\\gt ",nil,true],
  '/strong' => ["}",nil,true],
  'dfn'     => ["{\\bf\\gt ",nil,true],
  '/dfn'    => ["}",nil,true],
  # The blank after the font command keeps it from running into the
  # text that follows ("{\it" + "word" would read as \itword).
  'i'       => ["{\\it ",nil,true],
  '/i'      => ["}",nil,true],
  'address' => ["{\\it ",nil,true],
  '/address'=> ["}",nil,true],
  'cite'    => ["{\\it ",nil,true],
  '/cite'   => ["}",nil,true],
  'code'    => ["{\\tt ",nil,true],
  '/code'   => ["}",nil,true],
  'kbd'     => ["{\\tt ",nil,true],
  '/kbd'    => ["}",nil,true],
  'tt'      => ["{\\tt ",nil,true],
  '/tt'     => ["}",nil,true],
  'samp'    => ["{\\tt ",nil,true],
  '/samp'   => ["}",nil,true],
  'em'      => ["{\\em ",nil,true],
  '/em'     => ["}",nil,true],
  # Underline, subscript and superscript are set in text mode so that the
  # markup needs no "$", "_" or "^".
  'u'       => ["\\underline{",nil,true],
  '/u'      => ["}",nil,true],
  'sub'     => ["\\raisebox{-0.6ex}{\\scriptsize ",nil,true],
  '/sub'    => ["}",nil,true],
  'sup'     => ["\\raisebox{0.9ex}{\\scriptsize ",nil,true],
  '/sup'    => ["}",nil,true],
  'table'   => [:starttable, table_interp,true],
  '/table'  => [:endtable, "pop",true],
  'font'    => ["",nil,true],
  '/font'   => ["",nil,true],
}
################################ MAIN ####################################
# Driver: reads the HTML file named by the first argument and writes the
# LaTeX translation to standard output.  Tags without an entry in the
# tag tables are reported on standard error and skipped.
if ARGV.empty? then
  STDERR.print "usage: ",$0," file.html\n"
  exit 1
end
$in_document = false     # true between \begin{document} and \end{document}
print_header
intp = Environ_stack.new(Environment.new(main_interp))
f = TokenStream.new(ARGV[0])
until f.eof?
  tok = f.get
  if tok.kind_of?(Tag) then
    case tok.tagname
    when "body"
      # The document may already have been opened implicitly by an
      # earlier tag; do not open it twice.
      next if $in_document
      $in_document = true
    when "/body"
      next unless $in_document
      $in_document = false
    end
    act = intp.action(tok.tagname)
    if act.nil? then
      STDERR.print "tag ",tok.tagname," ignored\n"
    else
      if act[2] && !$in_document then
        print "\\begin{document}\n"
        $in_document = true
      end
      # environment push (a <P> opens one only when it has ALIGN)
      if act[1].kind_of?(Hash) &&
          (tok.tagname != "p" || !tok.switch('align').nil?) then
        intp.dup
        intp.top.set_interp(act[1])
      end

      if act[0].kind_of?(String) then
        intp.top.flush act[0]
      elsif act[0].kind_of?(Symbol) then # name of an Environment method
        intp.top.send(act[0],tok)
      end

      # environment pop
      if act[1] == "pop" then
        popped = intp.pop
        # A closing tag without its opening tag must not remove the
        # outermost environment.
        intp.push(popped) if intp.top.nil?
      end
    end
  elsif !tok.nil? then
    intp.top.flush tok
  end
end
if $in_document then
  print "\\end{document}\n"
end