sec_parser.processing_steps.page_header_classifier
Classes
Module Contents
- class sec_parser.processing_steps.page_header_classifier.PageHeaderCandidate
- TEXT_LENGTH_THRESHOLD = 100
- OCCURRENCE_THRESHOLD = 5
- MOST_COMMON_CANDIDATE_LIMIT = None
- text: str
- class sec_parser.processing_steps.page_header_classifier.PageHeaderClassifier(types_to_process: set[type[sec_parser.semantic_elements.abstract_semantic_element.AbstractSemanticElement]] | None = None, types_to_exclude: set[type[sec_parser.semantic_elements.abstract_semantic_element.AbstractSemanticElement]] | None = None)
-
- _NUM_ITERATIONS = 2
- _element_to_page_header_candidate: dict[sec_parser.semantic_elements.abstract_semantic_element.AbstractSemanticElement, PageHeaderCandidate]
- _candidate_count: collections.Counter[PageHeaderCandidate]
- _most_common_candidates: dict[PageHeaderCandidate, int] | None = None
- _process_element(element: sec_parser.semantic_elements.abstract_semantic_element.AbstractSemanticElement, context: sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step.ElementProcessingContext) -> sec_parser.semantic_elements.abstract_semantic_element.AbstractSemanticElement
- _find_page_header_candidates(element: sec_parser.semantic_elements.abstract_semantic_element.AbstractSemanticElement) -> None
- _classify_elements(element: sec_parser.semantic_elements.abstract_semantic_element.AbstractSemanticElement) -> sec_parser.semantic_elements.abstract_semantic_element.AbstractSemanticElement
- _get_most_common_candidates() -> dict[PageHeaderCandidate, int]