270 lines
8.8 KiB
Python

import sys
import os
import os.path
import datetime
import argparse
from xml.etree.ElementTree import Element, SubElement, tostring
from log import Log
from texparser import TexParser
from latexparser import LatexParser
from gettexfile import file_has_suffix
from gettexfile import get_tex_file
from xiwi.common.misc import buildFileList
from xiwi.common import arxivid
from xiwi.common.stats import Statistics
def str_contains(s1, s2):
    """Return True when s2 occurs somewhere inside s1, False otherwise."""
    return s2 in s1
def str_contains_one_of(st, st_list):
    """Return True when at least one entry of st_list occurs inside st.

    An empty st_list yields False.
    """
    return any(str_contains(st, candidate) for candidate in st_list)
def detect_file_kind(file_obj):
    """Guess what kind of source a file holds by looking at its text.

    Returns one of 'PS', 'auto-ignore', 'tex', 'latex' or 'unknown'.
    The file is rewound to its start before returning.
    """
    # Find the first line that is not purely whitespace.  At end of file
    # readline() gives '', which is not whitespace, so the loop stops.
    head = file_obj.readline()
    while head.isspace():
        head = file_obj.readline()

    if head.startswith('%!PS'):
        detected = 'PS'
    elif head.startswith('%auto-ignore'):
        detected = 'auto-ignore'
    else:
        detected = 'unknown'
        file_obj.seek(0)
        for text in file_obj:
            if '\\def' in text or '\\input' in text:
                # tentatively tex; a later line may still override this
                detected = 'tex'
            if 'amstex' in text or 'harvmac' in text:
                # conclusive: plain tex
                detected = 'tex'
                break
            if '\\documentclass' in text:
                # conclusive: latex
                detected = 'latex'
                break
            if '\\documentstyle' in text:
                # \documentstyle is shared; amsppt marks it as tex
                detected = 'tex' if 'amsppt' in text else 'latex'
                break

    file_obj.seek(0)
    return detected
class WithdrawnPaper(object):
    """Stand-in result used for files that start with '%auto-ignore'.

    It answers the same item lookups that callers perform on a parser
    ('success' and 'refs') and offers a parse() that does nothing.
    """

    def __getitem__(self, item):
        """Return True for 'success', a new empty list for 'refs', else None."""
        if item == 'success':
            return True
        if item == 'refs':
            return []
        return None

    def parse(self):
        """Nothing to parse for a withdrawn paper."""
        return None
def process_article(filename):
    """Locate, classify and parse the source of one article.

    Args:
        filename: name handed to get_tex_file() to locate the source.

    Returns:
        A parsed TexParser or LatexParser, a WithdrawnPaper for files marked
        '%auto-ignore', or None when no source file was found, the file is
        PostScript or of unknown kind, or parsing raised an exception.
    """
    # get the tex file
    filename, file_obj, tarfile_obj = get_tex_file(filename)
    if file_obj is None:
        # nothing to parse; release the archive handle if one was returned
        if tarfile_obj is not None:
            tarfile_obj.close()
        return None

    parser = None
    try:
        # detect the type of file
        kind = detect_file_kind(file_obj)

        # act on the type of file; a single chain so that exactly one
        # message is printed per file
        if kind == 'PS':
            print('skipping postscript file')
        elif kind == 'auto-ignore':
            print('asked to ignore file, most likely it was withdrawn')
            parser = WithdrawnPaper()
        elif kind == 'tex':
            print('parsing as TeX')
            parser = TexParser(filename, file_obj, tarfile_obj)
        elif kind == 'latex':
            print('parsing as LaTeX')
            parser = LatexParser(filename, file_obj, tarfile_obj)
        else:
            print('cannot determine kind of file')

        # attempt to parse the document; a parse error is reported and
        # turned into a None result rather than propagated
        if parser is not None:
            try:
                parser.parse()
            except Exception as e:
                print('exception while trying to parse file:')
                print(str(e))
                parser = None
    finally:
        # close the files, even if detection or parser construction raised
        file_obj.close()
        if tarfile_obj is not None:
            tarfile_obj.close()

    # return the parsed document
    return parser
# Names of arXiv classes, kept in alphabetical order.
# Not referenced anywhere else in the visible part of this file.
arxiv_classes = [
    'acc-phys', 'adap-org', 'alg-geom', 'ao-sci', 'astro-ph', 'atom-ph',
    'bayes-an',
    'chao-dyn', 'chem-ph', 'cmp-lg', 'comp-gas', 'cond-mat', 'cs',
    'dg-ga',
    'funct-an',
    'gr-qc',
    'hep-ex', 'hep-lat', 'hep-ph', 'hep-th',
    'math', 'math-ph', 'mtrl-th',
    'nlin', 'nucl-ex', 'nucl-th',
    'patt-sol', 'physics', 'plasm-ph',
    'q-alg', 'q-bio', 'quant-ph',
    'solv-int', 'supr-con',
]
def do_single_file(file_name, print_xml, write_xml_dir):
    """Process one input file and optionally emit its references as XML.

    Args:
        file_name: path of the file to process; the arXiv id and version
            are derived from it via arxivid.filenameToArxivAndVersion().
        print_xml: when truthy, print the generated XML to stdout.
        write_xml_dir: when a string, directory in which the XML file is
            written; other values (e.g. None) disable writing.

    Returns:
        The success flag: True for '.pdf' inputs, otherwise the parser's
        'success' item, or False when no parser could be produced.
    """
    arxiv_id, arxiv_version = arxivid.filenameToArxivAndVersion(file_name)
    if arxiv_id is None:
        print('WARN: could not determine arXiv identifier for', file_name)
        arxiv_id = '<unknown>'
        arxiv_version = 0

    Log.reset()
    Statistics.begin_item(arxiv_id)

    # Bind bib_refs on every path: '.pdf' inputs are counted as successful
    # without being parsed, and the XML section below iterates over it.
    bib_refs = []

    if file_has_suffix(file_name, '.pdf'):
        Statistics.count('1) pdf')
        succ = True
    else:
        # NOTE(review): the nesting of the hep/success/fail counters under
        # this branch was reconstructed from a source that had lost its
        # indentation -- confirm against the original file.
        Statistics.count('2) processed')

        parser = process_article(file_name)
        if parser is not None:
            succ = parser['success']
            bib_refs = parser['refs']
        else:
            succ = False

        if str_contains_one_of(arxiv_id, ['gr-qc', 'hep-']):
            Statistics.count('hep-processed')
            if succ:
                Statistics.count('hep-success')

        if succ:
            print('-success--------')
            Statistics.count('3) success')
        else:
            print('-fail-----------')
            Statistics.count('4) fail')

    # debugging switch: set to True to dump every reference that was found
    show_ref = False

    if succ and show_ref:
        for bib_ref in bib_refs:
            print(bib_ref.key, 'with', bib_ref.cite_count, 'citations in paper')
            if len(bib_ref.bib_info) == 0:
                print('no reference')
            else:
                print(bib_ref.bib_info_as_str(keep_comments=True))

    if succ and (print_xml or write_xml_dir):
        xml = Element('article')
        SubElement(xml, 'id').text = arxiv_id
        if arxiv_version > 0:
            SubElement(xml, 'version').text = str(arxiv_version)
        refs = SubElement(xml, 'refs')
        for bib_ref in bib_refs:
            bib_text = bib_ref.bib_info_as_str(keep_comments=True)
            if len(bib_text) != 0:
                # a reference is reported with a frequency of at least 1
                ncites = max(bib_ref.cite_count, 1)
                ref = SubElement(refs, 'ref', order=str(bib_ref.ref_order_num), freq=str(ncites))
                ref.text = bib_text

        if print_xml:
            # encoding='unicode' yields str; the default yields bytes, which
            # print() would show as a b'...' literal
            print(tostring(xml, encoding='unicode'))

        if isinstance(write_xml_dir, str):
            if arxiv_id != '<unknown>':
                base_name = arxiv_id.replace('/', '')
            else:
                # no arXiv id: fall back to the input file name, minus its
                # last extension (a leading dot is not treated as one)
                base_name = os.path.split(file_name)[1]
                dot_pos = base_name.rfind('.')
                if dot_pos > 0:
                    base_name = base_name[:dot_pos]
            xml_file_name = os.path.join(write_xml_dir, base_name + '.xml')
            with open(xml_file_name, 'wb') as file_obj:
                file_obj.write(tostring(xml, encoding='utf-8'))

    Statistics.end_item()
    return succ
# lines printed at the very end of a run, each prefixed with '**SUMMARY** [ptex]'
summaryStrs = []

if __name__ == "__main__":
    cmd_parser = argparse.ArgumentParser(description='Parse TeX/LaTeX to find references.')
    cmd_parser.add_argument('--filelist', action='store_true', help='file names on the command line each contain a list of files to process')
    cmd_parser.add_argument('--print-xml', action='store_true', help='print XML output to stdout')
    cmd_parser.add_argument('--write-xml', metavar='<dir>', help='destination directory to write XML output files')
    cmd_parser.add_argument('--failed', metavar='<file>', help='output file to write list of failed files')
    cmd_parser.add_argument('files', nargs='+', help='input files')
    args = cmd_parser.parse_args()

    # print date stamp
    timeStart = datetime.datetime.now()
    print('[ptex] started processing at', str(timeStart))

    print('given', len(args.files), 'files, first file:', args.files[0])
    print('================')

    Statistics.clear('article')

    # build list of files to process
    file_list = buildFileList(args.filelist, args.files)

    # ensure the destination directory exists; creation stays best-effort,
    # but a failure is reported instead of being silently ignored
    if args.write_xml is not None:
        try:
            os.makedirs(args.write_xml, exist_ok=True)
        except OSError as e:
            print('WARN: could not create output directory', args.write_xml, '-', e)

    # process the files
    failed_files = []
    for file_name in file_list:
        success = do_single_file(file_name, args.print_xml, args.write_xml)
        if not success:
            failed_files.append(file_name)

    # write the failed files to an output file, if requested
    if args.failed is not None:
        with open(args.failed, 'w') as file_obj:
            file_obj.writelines(f + '\n' for f in failed_files)

    print('================')
    Statistics.show()
    Statistics.show_detail('fail')
    #Statistics.show_detail('cite-range')
    #Statistics.show_detail('bad-ascii')
    #Statistics.show_detail('non-ascii')

    print('================')

    # print date stamp
    timeEnd = datetime.datetime.now()
    print('[ptex] finished processing at', str(timeEnd))

    # print summary for email
    summaryStrs.extend(Statistics.get_summary())
    summaryStrs.insert(0, 'started processing at %s, took %.1f minutes' % (timeStart.strftime('%H:%M'), (timeEnd - timeStart).total_seconds() / 60))
    for s in summaryStrs:
        print('**SUMMARY** [ptex]', s)