sec_parser.processing_steps.page_header_classifier

Classes

PageHeaderCandidate

PageHeaderClassifier

Module Contents

class sec_parser.processing_steps.page_header_classifier.PageHeaderCandidate
TEXT_LENGTH_THRESHOLD = 100
OCCURRENCE_THRESHOLD = 5
MOST_COMMON_CANDIDATE_LIMIT = None
text: str
style: sec_parser.semantic_elements.highlighted_text_element.TextStyle | None
class sec_parser.processing_steps.page_header_classifier.PageHeaderClassifier(types_to_process: set[type[sec_parser.semantic_elements.abstract_semantic_element.AbstractSemanticElement]] | None = None, types_to_exclude: set[type[sec_parser.semantic_elements.abstract_semantic_element.AbstractSemanticElement]] | None = None)

Bases: sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step.AbstractElementwiseProcessingStep

_NUM_ITERATIONS = 2
_element_to_page_header_candidate: dict[sec_parser.semantic_elements.abstract_semantic_element.AbstractSemanticElement, PageHeaderCandidate]
_candidate_count: collections.Counter[PageHeaderCandidate]
_most_common_candidates: dict[PageHeaderCandidate, int] | None = None
_process_element(element: sec_parser.semantic_elements.abstract_semantic_element.AbstractSemanticElement, context: sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step.ElementProcessingContext) -> sec_parser.semantic_elements.abstract_semantic_element.AbstractSemanticElement
_find_page_header_candidates(element: sec_parser.semantic_elements.abstract_semantic_element.AbstractSemanticElement) -> None
_classify_elements(element: sec_parser.semantic_elements.abstract_semantic_element.AbstractSemanticElement) -> sec_parser.semantic_elements.abstract_semantic_element.AbstractSemanticElement
_get_most_common_candidates() -> dict[PageHeaderCandidate, int]