Source code for lcat.loading.annotations

#!/usr/bin/env python
"""
BMI 260: Final Project
Load chest CT scan annotations from radiologist xml files.
"""
from collections import namedtuple
import os
import re
import xml.etree.ElementTree as ET

import numpy as np
import skimage
import skimage.measure
import skimage.segmentation

import lcat


# Nodule datatype
Nodule = namedtuple('Nodule', ['nodule_id', 'characteristics', 'origin', 'mask'])

# XML namespace abbreviations
XMLNS = {
    'nih': 'http://www.nih.gov'
}

# Tag name regex
TAG_NAME_RE = re.compile('^{' + XMLNS['nih'] + '}' + '(.+)$')


[docs]def load_radiologist_annotations(dicom_folder, dimensions, sop_instance_uids):
    """
    Load radiologist annotations (namely nodule characteristics and regions) from the xml files
    present in `dicom_folder`. Returns an array of Nodule objects representing all nodules found in
    the radiologist annotations.
    """
    # Create nodules placeholder
    nodules = []

    # Look for XML files
    for filename in os.listdir(dicom_folder):
        if filename.endswith('.xml'):
            # Reconstruct filepath
            filepath = os.path.join(dicom_folder, filename)

            # Load xml file
            tree = ET.parse(filepath)
            root = tree.getroot()

            # Find all nodules
            reads = root.findall('.//nih:readingSession//nih:unblindedReadNodule', XMLNS)

            # For each read
            for read in reads:
                # Extract nodule information
                nodule = get_nodule_information(read, dimensions, sop_instance_uids)

                # Only include >3mm nodules
                if any(dim > 1 for dim in nodule.mask.shape):
                    nodules.append(nodule)

    return nodules


[docs]def get_nodule_information(read, dimensions, sop_instance_uids):
    """
    Given an unblindedReadNodule element, create a Nodule object representing the nodule's
    characteristics and vertices.
    """
    # Get nodule ID
    nodule_id = get_read_nodule_id(read)

    # Get characteristics
    characteristics = get_read_characteristics(read)

    # Get mask
    origin, mask = get_read_mask(read, dimensions, sop_instance_uids)

    return Nodule(nodule_id, characteristics, origin, mask)


[docs]def get_read_nodule_id(read):
    # Find nodule ID element
    nodule_id_elem = read.find('.//nih:noduleID', XMLNS)

    # Return text content
    return nodule_id_elem.text


[docs]def get_read_characteristics(read):
    """
    Get the characteristics from a read as recorded by the radiologist. Returns an empty dictionary
    if no characteristics were recorded.
    """
    # Extract characteristics
    characteristics = {}
    for attribute_elem in read.findall('.//nih:characteristics//*', XMLNS):
        # Get attribute name (removing namespace)
        match = TAG_NAME_RE.match(attribute_elem.tag)
        assert match is not None
        attribute_name = match.group(1)

        # Get attribute value
        attribute_value = int(attribute_elem.text)

        characteristics[attribute_name] = attribute_value

    return characteristics


[docs]def get_read_mask(read, dimensions, sop_instance_uids):
    """
    Get a 3D array representing the region described by the specific read, prefaced by an origin
    specifying its placement in the image (in index coordinates).
    """
    # Get the full mask
    mask = get_mask_region(read, dimensions, sop_instance_uids)

    # Compress to small region with offset
    origin, mask = lcat.util.compress_nodule_mask(mask)

    return origin, mask


[docs]def get_mask_region(read, dimensions, sop_instance_uids):
    """
    Returns a full representation of the region represented by the given nodule read as a mask.
    """
    # Create mask output placeholder
    mask = np.zeros(dimensions, dtype=bool)

    # Create holes queue
    holes = []

    # Identify regions of interest
    for roi_elem in read.findall('.//nih:roi', XMLNS):
        # Check if it's a hole
        if roi_elem.find('.//nih:inclusion', XMLNS).text.upper() == 'FALSE':
            holes.append(roi_elem)
        else:
            mark_region(mask, roi_elem, sop_instance_uids)

    # Create unincluded mask placeholder
    unincluded = np.zeros(dimensions, dtype=bool)

    # Identify hole regions
    for roi_elem in holes:
        mark_region(unincluded, roi_elem, sop_instance_uids)

    # Remove unincluded regions
    mask &= np.logical_not(unincluded)

    return mask


[docs]def mark_region(mask, roi_elem, sop_instance_uids):
    """
    Mark the region of interest encoded by `roi_elem` in `mask`. `sop_instance_uids` is used to
    determine the slices referenced by `roi_elem`.
    """
    # Create mask boundary placeholder
    mask_boundary = np.zeros(mask.shape[:2], dtype=bool)

    # Get Z index
    sop_instance_uid = roi_elem.find('.//nih:imageSOP_UID', XMLNS).text
    z_index = sop_instance_uids.index(sop_instance_uid)

    # Mark boundary points
    for edge_elem in roi_elem.findall('.//nih:edgeMap', XMLNS):
        # Get x and y positions
        x_position = int(edge_elem.find('.//nih:xCoord', XMLNS).text)
        y_position = int(edge_elem.find('.//nih:yCoord', XMLNS).text)

        # Mark boundary in mask
        mask_boundary[x_position, y_position] = 1

    # Fill in region
    mask_regions = skimage.measure.label(mask_boundary, background=-1, connectivity=1)
    mask_center = skimage.segmentation.clear_border(mask_regions)
    mask[:, :, z_index] |= mask_center != 0


[docs]def main():
    """
    Command-line invocation routine.
    """
    import scans
    scan = scans.load_scan('../../data/LIDC-IDRI/LIDC-IDRI-0090')

    import IPython
    IPython.embed()


if __name__ == '__main__':
    main()