Homework no. 7
Splitting the Daily Dispatch
First we import some libraries:
import pandas as pd
import numpy as np
import re
import os
from tqdm import tqdm, tnrange, tqdm_notebook
Then we loop over each file in the source directory, strip all XML tags using a regex, and save the resulting plain text in another directory, replacing the .xml ending with .txt.
# Strip the XML markup from whole dispatch files and store the plain text.
# Reads *.xml files from `directory` and writes one .txt file per input
# into `directory_clean_text`.
directory = 'richmond'
directory_clean_text = 'cleaned_richmond_full'

# Create the output directory up front; otherwise the first write fails
# with FileNotFoundError when the directory does not exist yet.
os.makedirs(directory_clean_text, exist_ok=True)

# NOTE: only the first 100 directory entries are considered, and
# os.listdir returns entries in arbitrary order.
for filename in tqdm_notebook(os.listdir(directory)[0:100]):
    if not filename.endswith(".xml"):
        continue

    with open(os.path.join(directory, filename), 'r', encoding='utf8') as f:
        # Remove every tag, i.e. everything from '<' up to the next '>'.
        cleaned_article = re.sub(r'<[^>]*>', '', f.read())

    # Swap only the trailing extension; str.replace would rewrite every
    # '.xml' occurring anywhere in the file name.
    txt_name = os.path.splitext(filename)[0] + '.txt'

    # The input file is already closed here, so only one handle is open
    # at a time.
    with open(os.path.join(directory_clean_text, txt_name), 'w', encoding='utf8') as towrite:
        towrite.write(cleaned_article)
To get the individual articles we follow the same procedure as above, but split every dispatch into articles on ‘div3’, and then loop over the individual articles:
# Split every dispatch file into its individual articles and store each
# article as plain text. Reads *.xml files from `directory` and writes
# `<dispatch name>_<article index>.txt` files into `directory_clean_text`.
directory = 'richmond'
directory_clean_text = 'cleaned_richmond'

# Create the output directory up front; otherwise the first write fails
# with FileNotFoundError when the directory does not exist yet.
os.makedirs(directory_clean_text, exist_ok=True)

# NOTE: only the first 100 directory entries are considered, and
# os.listdir returns entries in arbitrary order.
for filename in tqdm_notebook(os.listdir(directory)[0:100]):
    if not filename.endswith(".xml"):
        continue

    with open(os.path.join(directory, filename), 'r', encoding='utf8') as f:
        # Split on the opening tag of each article. `[^>]*` stops at the
        # end of that tag; the greedy `.*>` used before ran on to the last
        # '>' of the line and swallowed any text following the tag on the
        # same line. `[^>]*` also matches a tag wrapped across lines.
        article_list = re.split(r'<div3 type="article"[^>]*>', f.read())

    # Drop only the trailing extension; str.replace would remove every
    # '.xml' occurring anywhere in the file name.
    base_name = os.path.splitext(filename)[0]

    # Element 0 is whatever precedes the first article, so it is skipped.
    for article_id, a in enumerate(article_list[1:]):
        # Keep only the text up to the first closing div3 tag.
        article = a.split('</div3>', 1)[0]
        # Remove every remaining tag inside the article.
        cleaned_article = re.sub(r'<[^>]*>', '', article)
        out_name = base_name + '_' + str(article_id) + '.txt'
        with open(os.path.join(directory_clean_text, out_name), 'w', encoding='utf8') as towrite:
            towrite.write(cleaned_article)