Textmining PubMed Abstracts with LeadMine

The US National Library of Medicine provide an annual baseline of PubMed Abstracts freely available for download, along with daily updates throughout the year. You can download the baseline with a command such as the following, if you have wget available:

wget --mirror --accept "*.xml.gz" ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/

Once downloaded, you will end up with close to 100 gzipped xml files, each one containing a large number of abstracts along with bibliographic data.

While it is possible to use the LeadMine library and an XML reader to textmine these any way one wants, it is convenient to use LeadMine’s command-line application to do the textmining if possible, as this has built-in reporting of results, makes use of multiple processors, and, well, you don’t need to write any Java to use it. However, without transforming these data somehow, the command-line application is not going to work very well as (a) it processes each entire XML file in one go, with resulting large memory usage and a lack of correspondence between a particular PMID and its results, (b) using more than one processor simply exacerbates the memory problem, and (c) while LeadMine handles XML without problems, it will inevitably end up textmining information that is not in the abstract (e.g. part of a journal name).

Fortunately, it is not difficult to transform the data into a more easily digestible form. For each gzipped XML file, the Python script below generates a zip containing a large number of small XML files, each one corresponding to a single PubMed abstract. Bibliographic information is included in XML attributes so that the only text that will be textmined is that of the title and abstract (indicated by T and A in the results below). Once the script is run, LeadMine can be used to textmine as shown below. Here I focus on diseases and ClinicalTrials.gov (NCT) numbers; note that while PubMed provide manually curated entries for both of these in the original XML, they are missing from the most recent abstracts (presumably due to a time lag).

java -jar leadmine-3.12.jar -c diseases_trails.cfg -tsv -t 12 -R D:\PubMedAbstracts\zipfiles > diseases_trials.txt

Fifteen minutes later (on my machine), I get an output file that includes the following where the PMID (and version) appears in the first column:

DocName	BegIndex	EndIndex	SectionType	EntityType	PossiblyCorrectedText	EntityText	CorrectionDistance	ResolvedForm
29170069.1	1272	1279	A	Disease	phobias	phobias	0	D010698
29170069.1	1290	1307	A	Disease	anxiety disorders	anxiety disorders	0	D001008
29170072.1	325	358	A	Disease	Exocrine pancreatic insufficiency	Exocrine pancreatic insufficiency	0	D010188
29170073.1	1856	1860	A	Disease	pain	pain	0	D010146
29170073.1	2087	2098	A	Trial	NCT02683707	NCT02683707	0	
29170074.1	334	349	A	Disease	cystic fibrosis	cystic fibrosis	0	D003550
29170075.1	127	146	T	Disease	depressive symptoms	depressive symptoms	0	D003866
29170075.1	419	438	A	Disease	depressive symptoms	depressive symptoms	0	D003866
29170075.1	476	495	A	Disease	depressive symptoms	depressive symptoms	0	D003866
29170075.1	1579	1598	A	Disease	depressive symptoms	depressive symptoms	0	D003866
29170075.1	2191	2202	A	Trial	NCT02860741	NCT02860741	0	
29170076.1	198	221	T	Disease	end-stage renal disease	end-stage renal disease	0	D007676
29170076.1	240	262	A	Disease	Cardiovascular disease	Cardiovascular disease	0	D002318
29170076.1	320	342	A	Disease	chronic kidney disease	chronic kidney disease	0	D051436
29170076.1	485	507	A	Disease	vascular calcification	vascular calcification	0	D061205
29170076.1	583	588	A	Disease	tumor	tumor	0	D009369
29170076.1	589	597	A	Disease	necrosis	necrosis	0	D009336
29170076.1	765	788	A	Disease	end-stage renal disease	end-stage renal disease	0	D007676

Python script

import os
import sys
import glob
import gzip
import zipfile
import multiprocessing as mp
import xml.etree.ElementTree as ET

class Details:
    """Plain record holding the text and bibliographic fields of one article.

    Attributes are stored exactly as passed in: ``title``, ``abstract``,
    ``year``, ``volume``, ``journal`` and ``page``.
    """

    def __init__(self, title, abstract, year, volume, journal, page):
        """Store the six fields unchanged on the instance."""
        self.title, self.abstract = title, abstract
        self.year, self.volume = year, volume
        self.journal, self.page = journal, page

    def __repr__(self):
        """Return a one-line citation followed by a blank line and the abstract."""
        citation = (
            f"{self.title!s} _{self.journal!s}_ "
            f"*{self.year!s}* _{self.volume!s}_ {self.page!s}"
        )
        return f"{citation}\n\nAbstract: {self.abstract!s}"

def getelements(filename_or_file, tag):
    """Yield *tag* elements from *filename_or_file* xml incrementally.

    The document is parsed with ``ET.iterparse``; each element whose tag
    equals *tag* is yielded once its end tag has been read. After the
    consumer has finished with a yielded element, the root element is
    cleared so already-processed content does not accumulate in memory.
    """
    parse_events = iter(ET.iterparse(filename_or_file, events=('start', 'end')))
    # The first event is the 'start' of the document root; keep a handle on it.
    document_root = next(parse_events)[1]
    for kind, node in parse_events:
        if kind != 'end' or node.tag != tag:
            continue
        yield node
        document_root.clear()  # free memory

def getText(node):
    """Return ``node.text``, or an empty string if *node* or its text is None."""
    content = None if node is None else node.text
    if content is None:
        return ""
    return content

def extract(medline):
    """Build a ``Details`` record from a citation element.

    Reads the ``Article`` child of *medline* and collects:

    * the title: all text inside ``ArticleTitle`` (including nested markup);
    * the abstract: the text of every ``Abstract/AbstractText`` element,
      joined with single spaces, or ``""`` when there is no ``Abstract``;
    * page, volume, year and journal title via ``getText``, so each is
      ``""`` when the corresponding element is missing or empty.
    """
    article = medline.find("Article")
    title_text = "".join(article.find("ArticleTitle").itertext())

    abstract_node = article.find("Abstract")
    if abstract_node is None:
        abstract_text = ""
    else:
        abstract_text = " ".join(
            "".join(section.itertext())
            for section in abstract_node.findall("AbstractText")
        )

    pages = getText(article.find("Pagination/MedlinePgn"))
    journal_node = article.find("Journal")
    issue_node = journal_node.find("JournalIssue")

    return Details(
        title_text,
        abstract_text,
        getText(issue_node.find("PubDate/Year")),
        getText(issue_node.find("Volume")),
        getText(journal_node.find("Title")),
        pages,
    )

class PubMed:
    """Stream the articles contained in one gzipped PubMed XML file.

    ``self.iter`` is a single-use generator of ``(pmid, version, medline)``
    tuples; ``getAll`` and ``getArticleDetails`` both consume it, so each
    article is seen at most once per instance.
    """

    def __init__(self, fname):
        """Open *fname* with ``gzip.open`` and set up the article iterator.

        The file is opened immediately (so a missing input still fails
        here), but nothing is parsed until ``self.iter`` is iterated.
        """
        self.iter = self._closing(gzip.open(fname))

    def _closing(self, mfile):
        """Yield everything from ``getArticles(mfile)``, then close *mfile*.

        The ``finally`` clause runs when iteration is exhausted, raises, or
        the generator is closed after having been started, so the gzip
        handle is no longer left open for the lifetime of the process.
        """
        try:
            yield from self.getArticles(mfile)
        finally:
            mfile.close()

    def getArticles(self, mfile):
        """Yield ``(pmid, version, medline)`` for each ``PubmedArticle`` in *mfile*.

        *pmid* is the text of the ``MedlineCitation/PMID`` element and
        *version* is that element's ``Version`` attribute (``None`` when the
        attribute is absent). *medline* is the ``MedlineCitation`` element.
        """
        for elem in getelements(mfile, "PubmedArticle"):
            medline = elem.find("MedlineCitation")
            pmidnode = medline.find("PMID")
            pmid = pmidnode.text
            version = pmidnode.get('Version')
            yield pmid, version, medline

    def getAll(self):
        """Yield ``(pmid, version, details)`` for every remaining article."""
        for pmid, version, medline in self.iter:
            yield pmid, version, extract(medline)

    def getArticleDetails(self, mpmid):
        """Return the extracted details of the next article matching *mpmid*.

        If *mpmid* is falsy the very next article is returned regardless of
        its PMID. Articles skipped while searching are consumed and cannot
        be revisited. Returns ``None`` when no matching article remains.
        """
        for pmid, _, medline in self.iter:
            if mpmid and mpmid != pmid:
                continue
            return extract(medline)
        return None

def handleonefile(inpname):
    """Convert one gzipped PubMed XML file into a zip of per-article XML files.

    For input ``<dir>/<basename>.xml.gz`` this writes, inside the
    ``reformatted`` directory:

    * ``<basename>.zip`` containing one ``<pmid>.<version>.xml`` member per
      article, whose root ``article`` element carries the bibliographic
      data as attributes and the title/abstract as child element text;
    * ``<basename>.idx`` listing ``<pmid>.<version>`` for each member, one
      per line.

    If the zip already exists the file is skipped. An article that cannot
    be serialised is reported and left out of both the zip and the index.
    """
    basename = os.path.basename(inpname).split(".")[0]
    outname = os.path.join("reformatted", basename + ".zip")
    if os.path.isfile(outname):
        print("SKIPPING: " + outname)
        return
    # Open the input only once we know there is work to do, so skipped
    # files do not leave a file handle open.
    pm = PubMed(inpname)
    print("REFORMATTING: " + outname)
    idxfile = os.path.join("reformatted", basename + ".idx")

    # NOTE(review): the name suggests an XML declaration was intended, but
    # only a newline is prepended to each document -- confirm.
    xmldeclaration = b'\n'

    with zipfile.ZipFile(outname, mode="w", compression=zipfile.ZIP_DEFLATED) as out, \
            open(idxfile, "w") as outidx:
        for pmid, version, article in pm.getAll():
            article_elem = ET.Element("article", {
                "pmid": pmid,
                "version": version,
                "journal": article.journal,
                "year": article.year,
                "volume": article.volume,
                "page": article.page,
            })
            title = ET.SubElement(article_elem, "title")
            title.text = article.title
            abstract = ET.SubElement(article_elem, "abstract")
            abstract.text = article.abstract

            xmlfile = f"{pmid}.{version}.xml"

            try:
                xmltext = xmldeclaration + ET.tostring(article_elem)
            except Exception as err:
                # Best-effort: report the offending article and skip it.
                # Without the `continue`, the write below would either hit
                # an undefined name or store the previous article's XML
                # under this article's file name.
                print(f"FAILED to serialise {xmlfile}: {err!r}", file=sys.stderr)
                print(article)
                continue
            out.writestr(xmlfile, xmltext)
            outidx.write(xmlfile[:-4] + "\n")

if __name__ == "__main__":
    # One worker per available CPU (falls back to 1 if the count is unknown)
    # instead of a value hard-coded for one particular machine.
    POOLSIZE = os.cpu_count() or 1

    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs("reformatted", exist_ok=True)

    fnames = glob.glob(os.path.join("abstracts", "*.xml.gz"))
    fnames.extend(glob.glob(os.path.join("dailyupdates", "*.xml.gz")))
    # Note that the filenames continue in numbering from one directory
    # to the other (but do not overlap)

    # The context manager shuts the worker processes down once every file
    # has been handled; previously the pool was never closed.
    with mp.Pool(POOLSIZE) as pool:
        # chunksize=1: files are large, so hand them out one at a time.
        # Iterating the results also re-raises any exception from a worker.
        for _ in pool.imap_unordered(handleonefile, fnames, 1):
            pass