-- ------------------------------------------------------------------------- -- -- -- -- Title: get_html_b.sql -- -- Author: Steve Roach -- -- Date: 24-JUL-2006 -- -- Version: 01.00 -- -- -- -- Description: A package for general control procedures. -- -- -- -- History: -- -- -- -- Date Ver Author Description -- -- --------- ----- ------ -------------------------------------------------- -- -- 25-JUL-06 01.00 SRR First release -- -- -- -- ------------------------------------------------------------------------- --
-- Package body definition. CREATE OR REPLACE PACKAGE BODY get_html AS -- Create types: TYPE TypeTagStack IS TABLE OF parsed_html.html_component%TYPE;
-- fetch_html Overload 1 -- Wrapper to do fetch with housekeeping. PROCEDURE fetch_html ( p_web_page IN VARCHAR2 ) IS l_run_no NUMBER; l_status NUMBER;
-- Set up mapping and page access rows for this run. PROCEDURE initialise_raw_html ( p_web_page IN VARCHAR2 ,p_run_no OUT NUMBER ) IS BEGIN ctl_gen.initialise_mapping('M_H2S_RAW_HTML', p_run_no); ctl_get_html.init_html_page_access(p_web_page, p_run_no);
EXCEPTION WHEN OTHERS THEN RAISE_APPLICATION_ERROR(-20009, 'gen_html.fetch_html(1)-initialise_raw_html Error: [' || SQLERRM || ']. p_web_page = [' || p_web_page || '], p_run_no = [' || p_run_no || '].'); END initialise_raw_html;
-- Close page access and mapping for this run. PROCEDURE finalise_raw_html ( p_run_no IN NUMBER ) IS BEGIN ctl_get_html.html_page_access_update_status(p_run_no, 'FETCHED'); ctl_gen.finalise_mapping(p_run_no);
EXCEPTION WHEN OTHERS THEN RAISE_APPLICATION_ERROR(-20010, 'gen_html.fetch_html(1)-finalise_raw_html Error: [' || SQLERRM || ']. p_run_no = [' || p_run_no || '].'); END finalise_raw_html;
BEGIN initialise_raw_html(p_web_page, l_run_no);
fetch_html(p_web_page, l_run_no);
finalise_raw_html(l_run_no);
COMMIT;
EXCEPTION WHEN OTHERS THEN RAISE_APPLICATION_ERROR(-20011, 'gen_html.fetch_html(1) Error: [' || SQLERRM || ']. p_web_page = [' || p_web_page || '].'); END fetch_html;
-- fetch_html Overload 2 -- Fetch web page. PROCEDURE fetch_html ( p_web_page IN VARCHAR2 ,p_run_no IN NUMBER ) IS l_url VARCHAR2(4000); l_pieces_tab UTL_HTTP.HTML_PIECES; l_piece_seq NUMBER := 0; BEGIN SELECT url INTO l_url FROM html_pages WHERE name = p_web_page;
l_pieces_tab := UTL_HTTP.REQUEST_PIECES(l_url);
FOR l_i IN 1..l_pieces_tab.count LOOP l_piece_seq := l_piece_seq + 1;
INSERT INTO raw_html ( run_no ,piece_seq ,html_piece ,ins_tsp ) VALUES ( p_run_no ,l_piece_seq ,l_pieces_tab(l_i) ,SYSDATE ); END LOOP;
EXCEPTION WHEN OTHERS THEN RAISE_APPLICATION_ERROR(-20012, 'gen_html.fetch_html(2): [' || SQLERRM || ']. p_web_page = [' || p_web_page || '], p_run_no = [' || p_run_no || '].'); END fetch_html;
-- parse_html Overload 1 -- Wrapper to do parse with housekeeping. PROCEDURE parse_html IS l_run_no NUMBER; l_status NUMBER;
-- Set up mapping for this run. PROCEDURE initialise_parse_html ( p_run_no OUT NUMBER ) IS BEGIN ctl_gen.initialise_mapping('M_S2S_PARSE_HTML', p_run_no);
EXCEPTION WHEN OTHERS THEN RAISE_APPLICATION_ERROR(-20013, 'gen_html.parse_html(1)-initialise_parse_html: [' || SQLERRM || '].'); END initialise_parse_html;
-- Close mapping for this run. PROCEDURE finalise_parse_html ( p_run_no IN NUMBER ) IS l_status NUMBER; BEGIN ctl_gen.finalise_mapping(p_run_no);
EXCEPTION WHEN OTHERS THEN RAISE_APPLICATION_ERROR(-20014, 'gen_html.fetch_html(1)-finalise_raw_html Error: [' || SQLERRM || ']. p_run_no = [' || p_run_no || '].'); END finalise_parse_html;
BEGIN initialise_parse_html(l_run_no);
parse_html(l_run_no);
finalise_parse_html(l_run_no);
EXCEPTION WHEN OTHERS THEN RAISE_APPLICATION_ERROR(-20015, 'gen_html.parse_html(1): Error: [' || SQLERRM || '].'); END parse_html;
-- parse_html Overload 2: OWB -- Parse all page accesses with status of 'FETCHED'. PROCEDURE parse_html ( p_parse_run_no IN NUMBER ) IS BEGIN FOR r_html_page_access IN (SELECT run_no raw_run_no FROM html_page_access WHERE status = 'FETCHED' ORDER BY run_no FOR UPDATE OF status) LOOP ctl_get_html.init_html_parse_passes( p_parse_run_no, r_html_page_access.raw_run_no);
parse_html(p_parse_run_no, r_html_page_access.raw_run_no);
ctl_get_html.html_parse_passes_update_stat(p_parse_run_no, r_html_page_access.raw_run_no, 'PARSED');
ctl_get_html.html_page_access_update_status( r_html_page_access.raw_run_no, 'PARSED');
END LOOP;
EXCEPTION WHEN OTHERS THEN RAISE_APPLICATION_ERROR(-20016, 'gen_html.parse_html(2): [' || SQLERRM || ']. p_parse_run_no = [' || p_parse_run_no || '].'); END;
-- parse_html Overload 3 -- Parse web page. PROCEDURE parse_html ( p_parse_run_no IN NUMBER ,p_raw_run_no IN NUMBER ) IS l_indent NUMBER := 0; l_piece_seq raw_html.piece_seq%TYPE := 0; l_state NUMBER; l_char VARCHAR2(1); l_piece_num NUMBER := 0; l_tag_buf VARCHAR2(4000); l_dat_buf VARCHAR2(4000);
t_tag_stack TypeTagStack := TypeTagStack('ROOT');
s_comp_reqd NUMBER := 1; -- Looking for HTML object. s_in_tag NUMBER := 2; -- In a tag. s_got_tag NUMBER := 3; -- Have got a tag name. s_in_dat NUMBER := 4; -- In a data block.
e_closing_tag_not_matched EXCEPTION; e_tag_opener_in_tag EXCEPTION; e_tag_opener_in_dat EXCEPTION; e_no_such_tag EXCEPTION; e_no_html EXCEPTION;
-- Push a tag on to the tag stack. PROCEDURE push_tag(p_tag_name IN VARCHAR2) IS BEGIN t_tag_stack.EXTEND;
t_tag_stack(t_tag_stack.COUNT) := p_tag_name;
EXCEPTION WHEN OTHERS THEN RAISE_APPLICATION_ERROR(-20017, 'gen_html.parse_html(3)-push_tag: [' || SQLERRM || ']. p_tag_name = [' || p_tag_name || '].'); END push_tag;
-- Pop a tag from the tag stack. PROCEDURE pop_tag(p_tag_name IN VARCHAR2) IS BEGIN IF t_tag_stack(t_tag_stack.COUNT) != p_tag_name THEN RAISE e_closing_tag_not_matched; END IF;
t_tag_stack.TRIM;
EXCEPTION WHEN e_closing_tag_not_matched THEN RAISE_APPLICATION_ERROR(-20025, 'gen_html.parse_html(3)-pop_tag: Error: An opening tag ' || '(t_tag_stack(t_tag_stack.COUNT) = [' || t_tag_stack(t_tag_stack.COUNT) || ']) is not matched ' || 'by the correct closing tag (p_tag_name = [' || p_tag_name || ']).'); WHEN OTHERS THEN RAISE_APPLICATION_ERROR(-20018, 'gen_html.parse_html(3)-pop_tag: [' || SQLERRM || ']. p_tag_name = [' || p_tag_name || '].'); END pop_tag;
-- Process a complete tag. PROCEDURE process_tag(p_tag IN VARCHAR2) IS l_tag VARCHAR2(4000); l_tag_type NUMBER; l_tag_name VARCHAR2(2000); l_treatment html_tags.treatment%TYPE; l_paired html_tags.paired%TYPE;
c_opening_tag NUMBER := 1; c_closing_tag NUMBER := 2; BEGIN l_tag := UPPER(p_tag);
IF SUBSTR(l_tag, 2, 1) != '/' THEN l_tag_type := c_opening_tag; l_tag_name := SUBSTR(l_tag, 2, LENGTH(l_tag) - 2);
IF SUBSTR(l_tag_name, 1, 2) = 'A ' THEN l_tag_name := 'A'; END IF; ELSE l_tag_type := c_closing_tag; l_tag_name := SUBSTR(l_tag, 3, LENGTH(l_tag) - 3); END IF;
BEGIN SELECT treatment, paired INTO l_treatment, l_paired FROM html_tags WHERE name = l_tag_name; EXCEPTION WHEN no_data_found THEN RAISE e_no_such_tag; END;
IF l_paired = 'Y' THEN IF l_tag_type = c_opening_tag THEN push_tag(l_tag_name); ELSE -- Closing tag. pop_tag(l_tag_name); END IF; END IF;
IF l_treatment = 'KEEP' THEN IF l_paired = 'Y' THEN IF l_tag_type = c_opening_tag THEN l_indent := l_indent + 1; ELSE l_indent := l_indent - 1; END IF; END IF;
l_piece_seq := l_piece_seq + 1;
INSERT INTO parsed_html ( parse_run_no ,raw_run_no ,component_seq ,indent ,html_component ,ins_tsp ) VALUES ( p_parse_run_no ,p_raw_run_no ,l_piece_seq ,l_indent ,p_tag ,SYSDATE ); END IF;
EXCEPTION WHEN e_no_such_tag THEN RAISE_APPLICATION_ERROR(-20019, 'gen_html.parse_html(3)-process_tag: Error: No such tag: [' || p_tag || '].'); WHEN OTHERS THEN RAISE_APPLICATION_ERROR(-20020, 'gen_html.parse_html(3)-process_tag: Error: [' || SQLERRM || ']. p_tag = [' || p_tag || '].'); END process_tag;
-- Process a piece of data. PROCEDURE process_dat(p_data IN VARCHAR2) IS l_data VARCHAR2(4000); BEGIN -- Strip leading and trailing spaces from data and then put a space -- on the end. l_data := RTRIM(LTRIM(p_data)) || ' ';
IF l_data != ' ' AND l_data != '&' || 'nbsp; ' THEN l_piece_seq := l_piece_seq + 1;
INSERT INTO parsed_html ( parse_run_no ,raw_run_no ,component_seq ,indent ,html_component ,ins_tsp ) VALUES ( p_parse_run_no ,p_raw_run_no ,l_piece_seq ,l_indent ,l_data ,SYSDATE ); END IF;
EXCEPTION WHEN OTHERS THEN RAISE_APPLICATION_ERROR(-20021, 'gen_html.parse_html(3)-process_dat: Error: [' || SQLERRM || ']. p_data = [' || p_data || '].'); END process_dat;
BEGIN l_state := s_comp_reqd;
FOR r_html_piece IN (SELECT html_piece FROM raw_html WHERE run_no = p_raw_run_no ORDER BY piece_seq) LOOP l_piece_num := l_piece_num + 1;
FOR l_piece_ptr IN 1..LENGTH(r_html_piece.html_piece) LOOP l_char := SUBSTR(r_html_piece.html_piece, l_piece_ptr, 1);
IF ASCII(l_char) IN (10) THEN l_char := ' '; END IF;
IF l_state = s_comp_reqd -- (1) THEN IF l_char = '<' THEN l_tag_buf := l_char; l_state := s_in_tag; ELSE l_dat_buf := l_char; l_state := s_in_dat; END IF; ELSIF l_state = s_in_tag -- (2) THEN IF l_char = '<' THEN RAISE e_tag_opener_in_tag; ELSIF l_char = ' ' THEN l_state := s_got_tag; ELSIF l_char = '>' THEN l_tag_buf := l_tag_buf || l_char; process_tag(l_tag_buf); l_state := s_comp_reqd; ELSE l_tag_buf := l_tag_buf || l_char; END IF; ELSIF l_state = s_got_tag -- (3) THEN IF l_char = '>' THEN l_tag_buf := l_tag_buf || l_char; process_tag(l_tag_buf); l_state := s_comp_reqd; END IF;
IF ASCII(SUBSTR(l_tag_buf, 1, 1)) = 60 -- Tag opener. AND UPPER(SUBSTR(l_tag_buf, 2, 1)) = 'A' THEN l_tag_buf := l_tag_buf || ' '; END IF;
IF ASCII(SUBSTR(l_tag_buf, 1, 1)) = 60 -- Tag opener. AND UPPER(SUBSTR(l_tag_buf, 2, 2)) = 'A ' THEN l_tag_buf := l_tag_buf || l_char; END IF; ELSIF l_state = s_in_dat -- (4) THEN IF l_char = '>' THEN RAISE e_tag_opener_in_dat; ELSIF l_char = '<' THEN process_dat(l_dat_buf); l_tag_buf := l_char; l_state := s_in_tag; ELSE l_dat_buf := l_dat_buf || l_char; END IF; END IF; END LOOP; END LOOP;
IF l_piece_num = 0 THEN RAISE e_no_html; END IF;
EXCEPTION WHEN e_no_html THEN RAISE_APPLICATION_ERROR(-20022, 'gen_html.parse_html(3): Error: There is no HTML to parse. ' || 'p_parse_run_no = [' || p_parse_run_no || '], p_raw_run_no = [' || p_raw_run_no || '].'); WHEN e_tag_opener_in_dat THEN RAISE_APPLICATION_ERROR(-20023, 'gen_html.parse_html(3): Error: There is a tag opener (' || CHR(60) || ') in the data. Data buffer = [' || l_dat_buf || '].'); WHEN e_tag_opener_in_tag THEN RAISE_APPLICATION_ERROR(-20024, 'gen_html.parse_html(3): Error: There is a tag opener (' || CHR(60) || ') in the tag. Tag buffer = [' || l_tag_buf || '].'); WHEN others THEN RAISE_APPLICATION_ERROR(-20026, 'gen_html.parse_html(3): Error: [' || SQLERRM || ']. p_parse_run_no = [' || p_parse_run_no || '], p_raw_run_no = [' || p_raw_run_no || '].'); END parse_html;
FUNCTION get_scec ( p_scec IN html_special_entity_codes.special_entity_code%TYPE ) RETURN VARCHAR2 IS l_char VARCHAR2(1); BEGIN SELECT character INTO l_char FROM html_special_entity_codes WHERE special_entity_code = UPPER(p_scec);
RETURN l_char; EXCEPTION WHEN no_data_found THEN RAISE_APPLICATION_ERROR(-20030, 'gen_html.get_scec: Error: Can''t match character to ' || '[' || p_scec || '].'); WHEN others THEN RAISE_APPLICATION_ERROR(-20031, 'gen_html.get_scec: Error: [' || SQLERRM || ']. p_scec = [' || p_scec || '].'); END get_scec;
FUNCTION replace_scec ( p_string IN VARCHAR2 ) RETURN VARCHAR2 IS l_in_string VARCHAR2(4000); l_scec_start NUMBER; -- Initialise out-string to null. l_out_string VARCHAR2(4000) := ''; BEGIN -- Capture in-string locally. l_in_string := p_string;
-- Loop forever... LOOP -- dbms_output.put_line('l_in_string = ' || l_in_string); -- dbms_output.put_line('l_out_string = ' || l_out_string);
-- Get the position of the first '&' character. l_scec_start := INSTR(l_in_string, '&'); -- If there are no more scec to process, exit loop. EXIT WHEN l_scec_start = 0;
-- Concatenate all in-string characters before the '&' to -- out-string. l_out_string := l_out_string || SUBSTR(l_in_string, 1, l_scec_start - 1);
-- Strip those characters assigned to out-string from in-string -- plus the '&'. l_in_string := SUBSTR(l_in_string, l_scec_start + 1);
-- Concatenate the dereferenced scec to the out-string. l_out_string := l_out_string || get_scec(SUBSTR(l_in_string, 1, INSTR(l_in_string, ';') - 1));
-- Strip the remainder of the scec from the in-string. l_in_string := SUBSTR(l_in_string, INSTR(l_in_string, ';') + 1); END LOOP;
-- Concatenate any remaining in-string characters to the out-string. l_out_string := l_out_string || l_in_string;
-- Return the out-string. RETURN l_out_string; EXCEPTION WHEN others THEN RAISE_APPLICATION_ERROR(-20032, 'gen_html.replace_scec: Error: [' || SQLERRM || ']. p_string = [' || p_string || '].'); END; END get_html; /
SHOW ERRORS
|
0 Comments:
Post a Comment
<< Home