File: //usr/share/texlive/texmf-dist/scripts/texdoc/texdoclib-score.tlu
-- texdoclib-score.tlu: scoring functions for texdoc
--
-- The TeX Live Team, GPLv3, see texdoclib.tlu for details
-- dependencies
local md5 = require 'md5'
local texdoc = {
util = require 'texdoclib-util',
config = require 'texdoclib-config',
}
-- module table; public functions are attached to it and it is returned at the
-- end of this file
local M = {}
-- shared variables: score adjustment tables filled by M.confline_to_score and
-- read by set_score.  global_adjscore maps pattern -> value; spec_adjscore
-- maps keyword -> (pattern -> value).
local global_adjscore, spec_adjscore = {}, {}
------------------------- configuration directives -------------------------
-- Store a numeric score for `key` (lowercased) in `tab` unless that key already
-- has a value.  Returns true whenever `val` converts to a number (even if the
-- existing entry was kept), and false when it does not.
local function set_score_table(tab, key, val)
    local name = string.lower(key)
    local num = tonumber(val)
    if not num then
        return false
    end
    -- first definition wins: never overwrite an existing entry
    if tab[name] == nil then
        tab[name] = num
    end
    return true
end
-- Try to read `line` as a score directive.  Two forms are recognised:
--   adjscore <pattern> = <value>             (global adjustment)
--   adjscore(<keyword>) <pattern> = <value>  (keyword-specific adjustment)
-- When one of them matches, the value is recorded through set_score_table and
-- that function's result is returned.  Returns false when neither form matches.
-- `file` and `pos` are accepted but not used by this function.
function M.confline_to_score(line, file, pos)
    -- global form first
    local g_pat, g_val = string.match(line,
        '^adjscore%s+([%w%p]+)%s*=%s*([%d+-.]+)')
    if g_pat and g_val then
        return set_score_table(global_adjscore, g_pat, g_val)
    end
    -- then the keyword-specific form
    local kw, s_pat, s_val = string.match(line,
        '^adjscore%(([%w%p]+)%)%s+([%w%p]+)%s*=%s*([%d+-.]+)')
    if not (kw and s_pat and s_val) then
        return false
    end
    -- keywords are stored lowercased; create the per-keyword table on demand
    kw = string.lower(kw)
    local kw_table = spec_adjscore[kw]
    if not kw_table then
        kw_table = {}
        spec_adjscore[kw] = kw_table
    end
    return set_score_table(kw_table, s_pat, s_val)
end
---------------------------- score computation -----------------------------
-- Says if pat is a "subword" of str.
--
-- pat is searched as plain text (no Lua pattern magic).  An occurrence counts
-- as a subword when it is delimited on both sides:
--   * left:  it starts the string, or its first character or the character
--            just before it is punctuation (%p);
--   * right: it ends the string, or its last character or the character just
--            after it is punctuation (%p).
-- Every occurrence is examined, not only the first one, so that e.g. "foo" is
-- recognised in "foobar-foo".  Returns a boolean.
local function is_subword(str, pat)
    local function is_delim(pos)
        return string.find(string.sub(str, pos, pos), '%p') ~= nil
    end
    local init = 1
    while true do
        local i, j = string.find(str, pat, init, true)
        if not i then
            return false
        end
        if (i == 1 or is_delim(i) or is_delim(i - 1))
            and (j == #str or is_delim(j) or is_delim(j + 1)) then
            return true
        end
        -- an empty pattern matches everywhere; only its first match is
        -- considered, which also guarantees the loop terminates
        if j < i then
            return false
        end
        init = i + 1
    end
end
-- Says if the last path component of `file` is a "bad" basename: it matches
-- an entry of the badbasename_list config value either entirely or up to a
-- following dot.  Entries are spliced into Lua patterns as-is.
local function has_bad_basename(file)
    -- keep only what follows the last slash
    local base = string.gsub(file, '.*/', '')
    for _, bad in ipairs(texdoc.config.get_value('badbasename_list')) do
        local whole = string.find(base, '^' .. bad .. '$')
        if whole or string.find(base, '^' .. bad .. '%.') then
            return true
        end
    end
    return false
end
-- Compute a heuristic score -10 <= s < 10 for `file` against `pat`.
--
-- file:      name of the candidate file (set_score passes it lowercased)
-- pat:       search pattern (set_score passes it lowercased)
-- dbg_score: printf-style debug logger, called as dbg_score(fmt, ...)
--
-- The score starts at -10 and is raised by exact/subword matches of the
-- pattern or of pattern..suffix for each entry of suffix_list.  A positive
-- score is then reduced to 0.1 for a bad extension or bad basename, and a
-- directory bonus of 1.5 is added when pat is a directory component of file.
-- Returns the resulting number.
local function heuristic_score(file, pat, dbg_score)
    -- pat is user-supplied text: hand it over as a format argument instead of
    -- splicing it into the format string, so a '%' in it cannot break the
    -- message formatting
    dbg_score('Start heuristic scoring with pattern: %s', pat)

    -- score management
    local score = -10
    -- raise the score to s if it is an improvement (or unconditionally when
    -- force is set) and log the reason
    local function upscore(s, reason, force)
        if s > score or force then
            score = s
            dbg_score('New heuristic score: %.1f. Reason: %s', s, reason)
        end
    end
    local slash = not not string.find(pat, '/', 1, true)

    -- look for exact or subword match
    if M.is_exact_locale(file, pat) then
        upscore(5, 'exact match with correct locale')
    elseif M.is_exact(file, pat) then
        upscore(4, 'exact match')
    elseif is_subword(file, pat) then
        upscore(1, 'subword match')
    end

    -- try derivatives unless pat contains a slash
    if not slash then
        for _, suffix in ipairs(texdoc.config.get_value('suffix_list')) do
            local deriv = pat .. suffix
            if M.is_exact(file, deriv) then
                upscore(4.5, 'exact match for derived pattern: ' .. deriv)
            elseif is_subword(file, deriv) then
                upscore(3.5, 'subword match for derived pattern: ' .. deriv)
            end
        end
    end

    -- if extension is bad, score becomes an epsilon
    local ext = texdoc.config.get_value('ext_list')[M.ext_pos(file)]
    if ext and texdoc.config.get_value('badext_list_inv')[ext] and score > 0 then
        upscore(0.1, 'bad extension', true)
    end

    -- if basename is bad, score becomes an epsilon
    if has_bad_basename(file) and score > 0 then
        upscore(0.1, 'bad basename', true)
    end

    -- bonus for being in the right directory
    if string.find('/' .. file, '/' .. pat .. '/', 1, true) and not slash then
        upscore(score + 1.5, 'directory bonus')
    end

    -- done
    dbg_score('Final heuristic score: %.1f', score)
    return score
end
-- Set the score of a docfile: compute it and store it in df.score.
--
-- df:          docfile table; the fields read here are normname, realpath,
--              matches, tree, tlptodoc, runtodoc and details
-- original_kw: keyword used to look up keyword-specific adjustments in
--              spec_adjscore
--
-- The score is the best score over df.matches (heuristic for original
-- patterns, alias score otherwise), with fallbacks for package/file
-- associations, a metadata bonus, and the configured adjscore adjustments.
local function set_score(df, original_kw)
    -- scoring is case-insensitive (patterns are already lowercased)
    local name = string.lower(df.normname)
    local df_id = string.sub(md5.sumhexa(name), 1, 7)

    -- special debugging function (printf-style: msg is a format string)
    local function dbg_score(msg, ...)
        -- add the hash id prefix to make the outputs grep-friendly
        texdoc.util.dbg_print('score', string.format('(%s) ', df_id) .. msg, ...)
    end

    -- paths and names may contain '%': pass them as format arguments rather
    -- than concatenating them into the format string
    dbg_score('Start scoring %s', df.realpath)
    dbg_score('Name used: %s', name)

    -- get score from patterns
    local score = -10
    for _, pat in ipairs(df.matches) do
        local s = -10
        local p = string.lower(pat.name)
        if pat.original then -- non-alias
            -- heuristic_score always returns a number, so and/or is safe here
            s = df.tree > -1 and heuristic_score(name, p, dbg_score) or 1
        elseif M.is_exact(name, p) then -- alias
            local bonus, note = 0, ''
            if pat.locale then
                bonus, note = 5, ', (language-based)'
            end
            s = (pat.score or 10) + bonus -- default alias score is 10
            dbg_score('Matching alias "%s", score: %.1f%s', pat.name, s, note)
        end
        if s > score then score = s end
    end
    dbg_score('Max pattern score: %.1f', score)

    -- get score from tlp associations
    if score == -10 and df.tlptodoc then
        score = -1
        dbg_score('New score: %.1f from package name association', score)
    end
    if score == -10 and df.runtodoc then
        score = -5
        dbg_score('New score: %.1f from sty/cls association', score)
    end

    -- bonus for metadata
    if df.details then
        if string.find(string.lower(df.details), 'readme') then
            score = score + 0.1
            dbg_score('Catalogue "readme" bonus: +0.1')
        else
            score = score + 1.5
            dbg_score('Catalogue details bonus: +1.5')
        end
    end

    -- adjust from keyword-specific tables
    if df.tree > -1 and spec_adjscore[original_kw] then
        for pat, val in pairs(spec_adjscore[original_kw]) do
            if val and is_subword('/' .. name, pat) then
                score = score + val
                dbg_score('Adjust by %.1f from specific pattern "%s"', val, pat)
            end
        end
    end

    -- adjust from global tables
    if df.tree > -1 then
        for pat, val in pairs(global_adjscore) do
            if val and is_subword('/' .. name, pat) then
                -- a file that matched nothing (-10) can only be pushed down
                if score > -10 or val < 0 then score = score + val end
                dbg_score('Adjust by %.1f from global pattern "%s"', val, pat)
            end
        end
    end
    dbg_score('Final score: %.1f', score)

    -- the final score should be a float value
    df.score = score + 0.0
end
-- Score every docfile of `list` in turn; `original_kw` is handed to set_score
-- unchanged.
local function set_list_scores(list, original_kw)
    for _, docfile in ipairs(list) do set_score(docfile, original_kw) end
end
-- Says if file is an exact match for pat.
--
-- The trailing part of file (after texdoc.util.parse_zip has processed it)
-- with as many path components as pat is compared to pat, either verbatim or
-- followed by '.' and one of the extensions from ext_list (the '' and '*'
-- entries are skipped).
-- Returns true on a match, false otherwise, and nil when file has fewer path
-- components than pat.
function M.is_exact(file, pat)
    file = texdoc.util.parse_zip(file)
    -- turn each path component of pat into '[^/]+' to grab the same number of
    -- trailing components from file
    local slashes = string.gsub(pat, '[^/]+', '[^/]+')
    -- keep this local: it used to leak into the global environment
    local basename = string.match(file, slashes .. '$')
    if not basename then return nil end
    if basename == pat then return true end
    for _, ext in ipairs(texdoc.config.get_value('ext_list')) do
        if ext ~= '' and ext ~= '*' and basename == pat .. '.' .. ext then
            return true
        end
    end
    return false
end
-- Says if file is an exact match for pat combined with the configured
-- language code, i.e. for "<pat>-<lang>" or "<lang>-<pat>".
function M.is_exact_locale(file, pat)
    -- a pattern ending in "-xx" or "-xxx" (lowercase letters) looks like it
    -- already carries a language code: never treat it as a locale match
    if string.match(pat, '%-%l%l%l?$') then
        return false
    end
    local lang = texdoc.config.get_value('lang')
    if not lang then
        return false
    end
    return M.is_exact(file, pat .. '-' .. lang)
        or M.is_exact(file, lang .. '-' .. pat)
end
-- Ordering function for docfiles (see texdoclib-search.tlu for structure).
-- Criteria, applied in this order until one of them differs:
--   1. score, higher first
--   2. ext_pos, lower first (extensions ordered as in ext_list)
--   3. normname, lexicographically smaller first
--   4. tree, higher first
-- Returns true if a is better than b.
local function docfile_order(a, b)
    if a.score > b.score then return true end
    if a.score < b.score then return false end
    if a.ext_pos < b.ext_pos then return true end
    if a.ext_pos > b.ext_pos then return false end
    if a.normname < b.normname then return true end
    if a.normname > b.normname then return false end
    return a.tree > b.tree
end
----------------------------- public functions -----------------------------
-- Returns the index in ext_list of the most specific extension of file, or
-- ext_list_max + 1 when no entry applies.
-- Selection rules, applied to the entries in list order:
--   * '*' is taken only while nothing has been selected yet;
--   * '' is taken when file contains no dot at all;
--   * any other entry e is taken when file ends with '.' .. e and e is
--     longer than the current selection (or the selection is empty or '*').
function M.ext_pos(file)
    -- remove zipext if applicable
    file = texdoc.util.parse_zip(file)
    -- now find the extension
    local best_pos, best_ext
    for idx, cand in ipairs(texdoc.config.get_value('ext_list')) do
        local accept = false
        if cand == '*' and best_ext == nil then
            accept = true
        elseif cand == '' and not string.find(file, '.', 1, true) then
            accept = true
        elseif string.sub(file, -string.len(cand) - 1) == '.' .. cand then
            accept = best_ext == nil or best_ext == '*'
                or string.len(cand) > string.len(best_ext)
        end
        if accept then
            best_pos, best_ext = idx, cand
        end
    end
    return best_pos or (texdoc.config.get_value('ext_list_max') + 1)
end
-- Classify a docfile by its score: 'good' for a score above 0, 'bad' for a
-- score above -100, and 'killed' for anything else.
function M.docfile_quality(df)
    local score = df.score
    if score > 0 then return 'good' end
    if score > -100 then return 'bad' end
    return 'killed'
end
-- Sort a doclist in place, best docfile first: score every entry for
-- original_kw, then order the list with docfile_order.
function M.sort_doclist(dl, original_kw)
    -- NOTE(review): stop() is defined on the doclist elsewhere; presumably it
    -- freezes the list before sorting -- confirm against its definition
    dl.stop(dl)
    set_list_scores(dl, original_kw)
    table.sort(dl, docfile_order)
end
return M
-- vim: ft=lua: