def _set_placeholder_dates(article_data):
    """Fill the four date fields of *article_data* with placeholder values."""
    article_data['mod_timestamp'] = PLACEHOLDER_TIMESTAMP
    article_data['mod_date_time'] = ''
    article_data['cr_timestamp'] = PLACEHOLDER_TIMESTAMP
    article_data['cr_date_time'] = ''


def _extract_article_dates(path, file, article_data):
    """Read the date header of an article and return the text that follows it.

    The first line must be a modification date in ``dd/mm/YYYY HH:MM`` form.
    The second line may be ``created <date>`` in the same form; when it is
    missing the modification date doubles as the creation date.  If the header
    cannot be parsed (including texts with fewer than three lines), a warning
    is emitted, placeholder dates are stored, the article is flagged as
    "coming soon" and *file* is returned untouched.
    """
    def parse_timestamp(text):
        return datetime.strptime(text, '%d/%m/%Y %H:%M').timestamp()

    try:
        first, second, rest = file.split('\n', 2)
        mod_date = parse_timestamp(first)
        if second.startswith('created '):
            cr_date = parse_timestamp(second.split(' ', 1)[1])
            remaining = rest
        else:  # no creation date supplied, so the second line is content
            cr_date = mod_date
            remaining = second + '\n' + rest
        dates = {
            'mod_timestamp': mod_date,
            'mod_date_time': timestamp_to_str(mod_date),
            'cr_timestamp': cr_date,
            'cr_date_time': timestamp_to_str(cr_date),
        }
    except ValueError:
        warn(f'no date found on first line of {path}, marking as "coming soon"')
        _set_placeholder_dates(article_data)
        article_data['coming_soon'] = True
        return file
    article_data.update(dates)
    return remaining


def _extract_article_tags(path, file, article_data):
    """Parse an optional leading ``[TAGS]`` line and return the remaining text.

    Tags are comma separated ``name colour`` pairs.  Entries that do not split
    into exactly two words trigger a warning and are skipped.  The first line
    is only consumed when it actually is a tag line.
    """
    # partition (rather than split + unpack) so that a text without any
    # newline does not raise
    first, _, rest = file.partition('\n')
    article_data['tags'] = []
    if not first.startswith('[TAGS]'):
        return file
    for tag in first.removeprefix('[TAGS]').split(','):
        split_tag = tag.strip().split()
        if len(split_tag) == 2:
            article_data['tags'].append({'name': split_tag[0], 'colour': split_tag[1]})
        else:
            warn(f'something wrong with the tags specified in {path}, skipping tag extraction')
    return rest


def _find_article_title(path, file):
    """Return the first ``# `` heading outside a code block, or ``'no_title'``."""
    # identify lines starting with '# ' that aren't inside a code block
    # (might be first line, so prepend \n)
    titles = re.findall(r'```.*?\n# .*?\n```|\n# (.*?)\n', '\n'+file, re.DOTALL)
    # when the code-block alternative matches, the capture group is empty but
    # still present, so drop those entries
    titles = [t for t in titles if t != '']
    if not titles:
        warn(f'no article title found in {path}')
        return 'no_title'
    if len(titles) > 1:
        warn(f'multiple article titles found in {path}, using first one')
    return titles[0]


def _prepare_markdown(file):
    """Rewrite the markdown source so that the markdown converter output is usable."""
    # replace \\ with \\\\, because for some reason later \\ is replaced with \ (probably by markdown2)
    file = file.replace('\\\\', '\\\\\\\\')
    # ensure displayed latex is preceded+followed by two newlines (only for
    # lines that start with $$, so "> $$asdf$$" is left alone), so that it is
    # wrapped in its own <p>
    file = re.sub(r'(\n\$\$.*?\$\$)', r'\n\n\1\n\n', file, flags=re.DOTALL)
    # ensure <Spoiler>/<hr> tags (opening, closing, self-closing) are
    # preceded+followed by two newlines so they end up in their own <p>
    for name in ('Spoiler', 'hr'):
        file = re.sub(r'<(/?)' + name + r'(/?)>\n?([^\n])', r'<\1' + name + r'\2>\n\n\3', file)
        file = re.sub(r'([^\n])\n?<(/?)' + name + r'(/?)>', r'\1\n\n<\2' + name + r'\3>', file)
    # do the same for math tags (these may carry attributes)
    for tag in math_tags:
        file = re.sub(r'<(/?)' + tag + r'([^>]*?)>\n?([^\n])', r'<\1' + tag + r'\2>\n\n\3', file)
        file = re.sub(r'([^\n])\n?<(/?)' + tag + r'([^>]*?)>', r'\1\n\n<\2' + tag + r'\3>', file)
    # find literal braces, for latex (so that the backslash doesn't die when being parsed)
    # NOTE(review): the replacement template '\\{' produces the same two
    # characters that were matched, so these two calls appear to leave the
    # text unchanged -- confirm whether the backslash was meant to be doubled.
    file = re.sub('\\\\{', '\\{', file)
    file = re.sub('\\\\}', '\\}', file)
    # in copiable code blocks, add copy buttons; iterate in reverse so that
    # earlier match offsets stay valid while the string is edited
    for m in reversed(list(re.finditer(r'__COPIABLE__\n```(.*?)\n(.*?)\n```', file, re.DOTALL))):
        lang, code = m[1], m[2]
        # copy button component takes newlines as literals
        copiable = code.replace('\\n', '\\\\n').replace('\n', '\\n')
        # the text ends up inside a double-quoted attribute, so quotes must be
        # written as an entity
        copiable = copiable.replace('"', '&quot;')
        modified = '__COPIABLE__\n\n<CopyButton text="' + copiable + '"/>\n\n```' + lang + '\n' + code + '\n```'
        file = file[:m.start()] + modified + file[m.end():]
    # set proofs inside theorem blocks to be unquoted and unbolded
    for tag in ['Thm', 'Lemma']:
        block_re = r'(<' + tag + r'[^>]*?>)(.*?)(</' + tag + r'>)'
        for m in reversed(list(re.finditer(block_re, file, re.DOTALL))):
            modified = re.sub(r'(<Proof[^>]*?)(>.*?)(</Proof>)', r'\1 unquoted unbolded\2\3', m[2], flags=re.DOTALL)
            file = file[:m.start()] + m[1] + modified + m[3] + file[m.end():]
    return file


def _html_to_jsx(page, target_dir):
    """Post-process converter output into JSX; return ``(page, table_of_contents)``."""
    # replace curly braces with html character codes, so that react ignores them
    page = page.replace('{', '&#123;')
    page = page.replace('}', '&#125;')
    # pre doesn't work in nextjs, so replace whitespace inside <pre> with
    # character codes (reverse order keeps earlier offsets valid)
    pre_spans = [i.span() for i in re.finditer(r'(?<=<pre>)(.*?)(?=</pre>)', page, re.DOTALL)]
    for start, end in reversed(pre_spans):
        target = page[start:end]
        target = target.replace('span ', 'span#')  # temporarily protect the space inside span tags
        target = target.replace(' ', '&nbsp;')  # replace whitespace
        target = target.replace('span#', 'span ')
        target = target.replace('\n', '<br/>')  # replace newlines
        page = page[:start] + target + page[end:]
    # add <Latex> tags
    page = re.sub(r'<p>\$\$(.*?)\$\$</p>', r'<Latex>\1</Latex>', page)  # remove <p> tags that markdown added
    # temporarily drop the double dollar signs, so that single dollar signs can be handled first
    page = re.sub(r'\$\$(.*?)\$\$', r'<Latex>\1</Latex>', page)
    page = re.sub(r'\$(.+?)\$', r'<Latex>$\1$</Latex>', page)
    # display latex inside quoted blocks sits on a non-white background, so it
    # gets scrollshadow-horizontal-blockquote instead of scrollshadow-horizontal;
    # a block span is used instead of a div so that no hydration error is triggered
    display_re = r'<Latex>([^$].*?[^$])</Latex>'
    quoted_blocks = [('Thm', ''), ('Lemma', ''), ('Proof', ''), ('Defn', ''), ('Example', 'quoted'), ('blockquote', ''), ('Spoiler', '')]
    for tag, quoted in quoted_blocks:
        block_re = r'<' + tag + r'[^>]*?' + quoted + r'.*?>.*?</' + tag + r'>'
        block_spans = [i.span() for i in re.finditer(block_re, page, re.DOTALL)]
        for start, end in reversed(block_spans):
            target = page[start:end]
            # unquoted blocks aren't grey, so skip
            if 'unquoted' in re.findall(r'<.*?>', target)[0]:
                continue
            # restore $$ and wrap
            target = re.sub(display_re, r'<span className="scrollshadow-horizontal-blockquote latex-display-wrapper"><Latex>$$\1$$</Latex></span>', target)
            page = page[:start] + target + page[end:]
    # everything still unwrapped lives outside a quoted block
    page = re.sub(display_re, r'<span className="scrollshadow-horizontal latex-display-wrapper"><Latex>$$\1$$</Latex></span>', page)
    # <p> tags will have been placed around the following tags (on purpose), remove them
    for name in ['CopyButton', 'Spoiler', 'hr'] + math_tags:
        page = re.sub(r'<p>(</?' + name + r'.*?>)</p>', r'\1', page)
    # build the table of contents from h2 tags before link anchors are added
    # (each h2 tag is given a unique id by the header-ids extension)
    table_of_contents = [[i.group(2), '#'+i.group(1)] for i in re.finditer(r'<h2 id="(.*?)">(.*?)</h2>', page, re.DOTALL)]
    page = add_link_anchors(page, target_dir)
    # markdown added class attributes, replace with "className" for react
    page = page.replace('class=', 'className=')
    # move copy buttons generated for __COPIABLE__ code blocks into their containers
    page = re.sub(r'<p>__COPIABLE__</p>\n\n<CopyButton(.*?)/>\n\n<div className="codehilite">\n<pre>(.*?)</pre>\n</div>', r'<div className="codehilite relative">\n<div className="absolute top-2 right-2"><CopyButton\1/></div>\n<pre>\2</pre>\n</div>', page)
    page = re.sub(r'<p>__COPIABLE__</p>\n\n<CopyButton(.*?)/>\n\n<pre>(.*?)</pre>', r'<pre className="relative">\n<div className="absolute top-2 right-2"><CopyButton\1/></div>\n\2</pre>', page)
    return page, table_of_contents


def _copiable_plaintext(content):
    """Turn flattened article content into text for the "copy article" button."""
    import ast  # local import: only needed here

    # Decode the backslash escapes in the content by reading it as a string
    # literal.  literal_eval gives the same result as eval for any input that
    # forms a string literal, but cannot execute code embedded in an article.
    plaintext = ast.literal_eval('"' + content.replace('"', '\\"') + '"')
    # html.escape also turns double quotes into &quot;, so no separate quote
    # handling is needed afterwards
    plaintext = html.escape(plaintext)
    # replace braces for nextjs
    plaintext = plaintext.replace('{', '&#123;').replace('}', '&#125;')
    # copy button component takes newlines as literals
    plaintext = plaintext.replace('\\n', '\\\\n').replace('\n', '\\n')
    return plaintext


def parse_md_file_to_react(path, target_dir, file, is_folder_readme=False, is_course_readme=False):
    """Convert the markdown text of one article into JSX plus its metadata.

    Args:
        path: Source path of the markdown file; used in warnings and to derive
            the article id and directory list.
        target_dir: Output location; its last ``/``-separated component
            becomes the article name.
        file: The markdown text itself (not a file object).
        is_folder_readme: True when the text is a folder readme.
        is_course_readme: True when the text is a course readme.

    Returns:
        A tuple ``(page, article_data, page_title, copiable_article_plaintext,
        table_of_contents)``.  ``page`` is the generated markup,
        ``article_data`` a dict with the keys ``type``, ``coming_soon``, the
        four date fields, ``tags``, ``title``, ``content``, ``id``, ``dir``
        and ``name``.  ``copiable_article_plaintext`` is ``None`` when
        ``COURSE_INDICATOR`` occurs in *path*.  ``table_of_contents`` is a
        list of ``[heading_html, '#' + heading_id]`` pairs, one per ``<h2>``.
    """
    article_data = {}
    is_readme = is_folder_readme or is_course_readme
    # set article type (article, folder readme, course readme); course wins
    # when both flags are set
    article_data['type'] = 'article'
    if is_folder_readme:
        article_data['type'] = 'folder'
    if is_course_readme:
        article_data['type'] = 'course'
    # extract article dates from the first lines (readmes carry no dates)
    article_data['coming_soon'] = False
    if is_readme:
        _set_placeholder_dates(article_data)
    else:
        file = _extract_article_dates(path, file, article_data)
    # extract article tags, if they exist
    file = _extract_article_tags(path, file, article_data)
    # identify article title
    article_data['title'] = _find_article_title(path, file)
    # markdown -> html -> jsx
    file = _prepare_markdown(file)
    page = markdown(file, extras=['fenced-code-blocks', 'codehilite', 'code-friendly', 'header-ids', 'footnotes', 'wiki-tables'])
    page, table_of_contents = _html_to_jsx(page, target_dir)
    article_data['content'] = flatten_content(page, article_data['title'])
    # NOTE(review): if `hash` is the builtin, string hashes are salted per
    # process unless PYTHONHASHSEED is fixed, so ids would differ between
    # builds -- confirm this is intended.
    article_data['id'] = hash(path.removeprefix(ROOT_DIR))
    path_list = sanitize(beautify(path)).split('/')[1:-1]  # path to parent folder
    article_data['dir'] = path_list[:-1] if is_readme else path_list  # readmes should be elevated
    article_name = target_dir.split('/')[-1]
    article_data['name'] = article_name
    page_title = article_name.title().replace('-', ' ')
    copiable_article_plaintext = None
    if COURSE_INDICATOR not in path:  # don't add "copy article plaintext" button to course content
        copiable_article_plaintext = _copiable_plaintext(article_data['content'])
    return page, article_data, page_title, copiable_article_plaintext, table_of_contents