Python parser for a (simple) LaTeX file - Python

TopAnswers Python

Meta

Databases

TeX

Code Golf

APL

C++

.net

db<>fiddle

Java

*nix

PHP

PowerShell

Python

Rust

टेक्-मराठी

Typst

Web Client Dev

Web Server Dev

Python parser for a (simple) LaTeX file

add tag

5 years ago JeT

My question

How do I extract the contents of the .tex file described below into a flat .csv file with the format shown below?

Posted here originally

TeX_TA

Context

I created all my MCQ with the exam package. However, exams are now 100% online… (sigh)

My LaTeX file has the basic following format

\documentclass[12pt]{exam}

\begin{document}

\begin{questions}
 \question What is the answer ? 
 \begin{oneparchoices}
 \choice 70
 \choice 75
 \choice 80
 \CorrectChoice 85
 \choice None of the above
 \end{oneparchoices}

 \question What is the answer to the second question ? 
 \begin{oneparchoices}
 \choice 70
 \choice 75
 \CorrectChoice 80
 \choice 85
 \choice None of the above
 \end{oneparchoices}

\end{questions}
\end{document}

xxxxxxxxxx
 
\documentclass[12pt]{exam}​\begin{document}​\begin{questions} \question What is the answer ?  \begin{oneparchoices} \choice 70 \choice 75 \choice 80 \CorrectChoice 85 \choice None of the above \end{oneparchoices}​ \question What is the answer to the second question ?  \begin{oneparchoices} \choice 70 \choice 75 \CorrectChoice 80 \choice 85 \choice None of the above \end{oneparchoices}​\end{questions}\end{document}
show all 25 lines

I need now to provide a csv where the questions of the MCQ above would be displayed like (Incorrect,Correct, just to be clear 😃 )

question,answer1,Cor/Inc,answer2,Cor/Inc,answer3,Cor/Inc,answer4,Cor/Inc,answer5,Cor/Inc

And it would render like

What is the answer ?,70,Inc,75,Inc,80,Inc,85,Cor,None of the above,Inc

Each line would obviously be a new question.

What could correspond so far

I found something interesting in Python, but I care more about getting a working solution than about which programming language it uses.

I see the principle for environment between \begin and \end thanks to https://stackoverflow.com/questions/11054008/extract-figures-from-latex-file

# Copy every line from the one containing 'begin{questions}' through the one
# containing 'end{questions}' into the output file, followed by a separator.
# The context manager closes both files even if reading or writing fails.
with open('MCQ.tex', 'r') as infile, open('FlattenMCQ.csv', 'w') as outfile:
    extract_block = False
    for line in infile:
        if 'begin{questions}' in line:
            extract_block = True
        if extract_block:
            outfile.write(line)
        # Checked after writing so the closing line itself is still copied.
        if 'end{questions}' in line:
            extract_block = False
            outfile.write("------------------------------------------\n\n")

xxxxxxxxxx
 
infile = open('MCQ.tex', 'r')outfile = open('FlattenMCQ.csv', 'w')extract_block = Falsefor line in infile:    if 'begin{questions}' in line:        extract_block = True    if extract_block:        outfile.write(line)    if 'end{questions}' in line:        extract_block = False        outfile.write("------------------------------------------\n\n")​infile.close()outfile.close()
show all 14 lines

Where I am stuck: the nested (recursive) matching, testing first for \begin{questions}, then \question, then \begin{oneparchoices}, then \choice or \CorrectChoice.

Top Answer

5 years ago wizzwizz4

This code’s very messy, and only parses TeX in the very specific format you’ve provided, and won’t always give you errors if the input document is “malformed”, but it should work:

import csv
from functools import partial
from itertools import takewhile

def questions(lines):
    """Yield ``(question, answers)`` pairs parsed from exam-class TeX lines.

    Only the simple layout is understood: a ``\\begin{questions}``
    environment in which each ``\\question`` line is immediately followed by
    a ``\\begin{oneparchoices}`` line, then one choice command per line, then
    ``\\end{oneparchoices}``.

    Args:
        lines: An iterable of text lines (an open file works).

    Yields:
        ``(question, answers)`` where ``question`` is the stripped text after
        ``\\question`` and ``answers`` is a tuple of ``(text, is_correct)``
        pairs; ``is_correct`` is true when the command name contains
        ``Correct``.

    Raises:
        ValueError: If the line after a ``\\question`` line does not contain
            ``\\begin{oneparchoices}`` (including when the input ends there).
    """
    lines = iter(lines)
    # Skip everything up to and including the \begin{questions} line.
    for line in lines:
        if r'\begin{questions}' in line:
            break
    else:
        # No questions environment at all: nothing to yield.
        return
    while True:
        for line in lines:
            if r'\end{questions}' in line:
                return
            q = line.split(r'\question', maxsplit=1)
            if len(q) == 2:
                question = q[1].strip()
                break
        else:
            # Input ended without \end{questions}; stop cleanly instead of
            # letting StopIteration escape the generator as RuntimeError.
            return
        # The default keeps an exhausted input on the ValueError path below.
        if r'\begin{oneparchoices}' not in next(lines, ''):
            raise ValueError(r"Expected \begin{oneparchoices}")
        answers = []
        # takewhile also consumes the \end{oneparchoices} line itself.
        for line in takewhile(
            lambda line: r'\end{oneparchoices}' not in line,
            lines
        ):
            parts = line.split(maxsplit=1)
            if not parts:
                continue  # blank line between choices
            choice = parts[0]
            # A bare command with no text yields an empty answer.
            answer = parts[1] if len(parts) == 2 else ''
            answers.append((answer.strip(), r'Correct' in choice))
        yield question, tuple(answers)

def q_flatten(questions):
    """Yield one flat tuple per question: its text, then the flattened answers.

    ``questions`` is an iterable of ``(question, answers)`` pairs; the answers
    are expanded with ``a_flatten``.
    """
    for text, choices in questions:
        row = [text]
        row.extend(a_flatten(choices))
        yield tuple(row)

def a_flatten(answers):
    """Yield each answer's text followed by ``'Cor'`` or ``'Inc'``.

    ``answers`` is an iterable of ``(text, correct)`` pairs; a truthy
    ``correct`` produces ``'Cor'``, anything else ``'Inc'``.
    """
    for text, is_correct in answers:
        yield text
        yield 'Cor' if is_correct else 'Inc'

# Read MCQ.tex and write one CSV row per parsed question.
with open('MCQ.tex') as in_:
    # newline='' lets the csv module control line endings itself.
    with open('FlattenMCQ.csv', 'w', newline='') as out:
        rows = q_flatten(questions(in_))
        writer = csv.writer(out)
        # If you want a header, add it here.
        ##writer.writerow(("column1", "column2", "etc."))
        writer.writerows(rows)

xxxxxxxxxx
 
import csvfrom functools import partialfrom itertools import takewhile​def questions(lines):    lines = iter(lines)    for line in lines:        if r'\begin{questions}' in line:            break    while True:        for line in lines:            if r'\end{questions}' in line:                return            q = line.split(r'\question', maxsplit=1)            if len(q) == 2:                question = q[1].strip()                break        if r'\begin{oneparchoices}' not in next(lines):            raise ValueError(r"Expected \begin{oneparchoices}")        yield question, tuple(            (answer.strip(), r'Correct' in choice)            for choice, answer in map(                partial(str.split, maxsplit=1),                takewhile(                    lambda line: r'\end{oneparchoices}' not in line,                    lines                )            )        )​def q_flatten(questions):    for question, answers in questions:        yield (question,) + tuple(a_flatten(answers))​def a_flatten(answers):    for answer, correct in answers:        yield answer        if correct:            yield 'Cor'        else:            yield 'Inc'​with open('MCQ.tex') as in_, \     open('FlattenMCQ.csv', 'w', newline='') as out:    writer = csv.writer(out)    # If you want a header, add it here.    ##writer.writerow(("column1", "column2", "etc."))    writer.writerows(q_flatten(questions(in_)))
show all 48 lines

Answer #2

5 years ago wizzwizz4

If you have pip, and can install the TexSoup package (pick one):

> py -3 -m pip install TexSoup
$ python3 -m pip install TexSoup
$ python -m pip install TexSoup

xxxxxxxxxx
 
> py -3 -m pip install TexSoup$ python3 -m pip install TexSoup$ python -m pip install TexSoup

then this would probably be more resilient to TeX formatting changes, but it needs to load the entire file into memory, so it wouldn’t work for quizzes as large.

import csv

from TexSoup import TexSoup

def questions(tex):
    """Yield ``(question, answers)`` pairs from TeX source parsed by TexSoup.

    The node sequence is consumed three items at a time: one item that is
    skipped, one whose stripped value is the question text, and one that is
    iterated in pairs of ``(command, text)``. Each pair becomes
    ``(text.strip(), command.name == 'CorrectChoice')``.
    """
    soup = TexSoup(tex)
    # NOTE(review): presumably soup[-1] is the document environment and its
    # first child is \begin{questions}; confirm against TexSoup's indexing.
    nodes = iter(soup[-1][0])
    while True:
        try:
            next(nodes)  # skip over \question
        except StopIteration:
            return
        question = next(nodes).strip()
        # Sharing one iterator between both zip arguments pairs up
        # consecutive items: (item0, item1), (item2, item3), ...
        pairs = iter(next(nodes))
        answers = tuple(
            (text.strip(), command.name == 'CorrectChoice')
            for command, text in zip(pairs, pairs)
        )
        yield question, answers

# The below is the same as my other answer:
def q_flatten(questions):
    """Yield one flat tuple per question: its text, then the flattened answers.

    ``questions`` is an iterable of ``(question, answers)`` pairs; the answers
    are expanded with ``a_flatten``.
    """
    for text, choices in questions:
        row = [text]
        row.extend(a_flatten(choices))
        yield tuple(row)

def a_flatten(answers):
    """Yield each answer's text followed by ``'Cor'`` or ``'Inc'``.

    ``answers`` is an iterable of ``(text, correct)`` pairs; a truthy
    ``correct`` produces ``'Cor'``, anything else ``'Inc'``.
    """
    for text, is_correct in answers:
        yield text
        yield 'Cor' if is_correct else 'Inc'

# Read MCQ.tex and write one CSV row per parsed question.
with open('MCQ.tex') as in_:
    # newline='' lets the csv module control line endings itself.
    with open('FlattenMCQ.csv', 'w', newline='') as out:
        rows = q_flatten(questions(in_))
        writer = csv.writer(out)
        # If you want a header, add it here.
        ##writer.writerow(("column1", "column2", "etc."))
        writer.writerows(rows)

xxxxxxxxxx
 
import csv​from TexSoup import TexSoup​def questions(tex):    soup = TexSoup(tex)    i = iter(soup[-1][0])  # \begin{questions}    while True:        try:            next(i)  # skip over \question        except StopIteration:            return        question = next(i).strip()        answers = tuple(            (answer.strip(), choice.name == 'CorrectChoice')            for choice, answer in zip(*(2*(iter(next(i)),)))        )        yield question, answers​# The below is the same as my other answer:def q_flatten(questions):    for question, answers in questions:        yield (question,) + tuple(a_flatten(answers))​def a_flatten(answers):    for answer, correct in answers:        yield answer        if correct:            yield 'Cor'        else:            yield 'Inc'​with open('MCQ.tex') as in_, \     open('FlattenMCQ.csv', 'w', newline='') as out:    writer = csv.writer(out)    # If you want a header, add it here.    ##writer.writerow(("column1", "column2", "etc."))    writer.writerows(q_flatten(questions(in_)))
show all 38 lines

2 Answers