-- ------------------------------------------------------------------------- -- -- -- -- Title: extract_html_data_b.sql -- -- Author: Steve Roach -- -- Date: 09-SEP-2006 -- -- Version: 01.00 -- -- -- -- Description: Container for all procedures and functions relating -- -- to extracting data from semi-structured HTML. -- -- -- -- History: -- -- -- -- Date Ver Author Description -- -- --------- ----- ------ -------------------------------------------------- -- -- 09-SEP-06 01.00 SRR First release -- -- -- -- ------------------------------------------------------------------------- --
-- Package header definition. CREATE OR REPLACE PACKAGE BODY extract_html_data AS FUNCTION extract_table ( p_table_no IN NUMBER ,p_web_page IN html_pages.name%TYPE ) RETURN data_list IS l_parse_run_no parsed_html.parse_run_no%TYPE; l_raw_run_no parsed_html.raw_run_no%TYPE; l_table_start_comp_seq parsed_html.component_seq%TYPE; l_table_end_comp_seq parsed_html.component_seq%TYPE; l_table_rows NUMBER; l_table_columns NUMBER;
l_data_list data_list := data_list(); BEGIN -- Get run_no keys (parse and raw) for parsed_html from the web page -- name. SELECT parse_run_no ,raw_run_no INTO l_parse_run_no, l_raw_run_no FROM html_parse_passes WHERE parse_run_no = (SELECT MAX(parse_run_no) FROM html_parse_passes WHERE raw_run_no = (SELECT MAX(run_no) FROM html_page_access WHERE html_page_name = p_web_page AND status = 'PARSED') AND status = 'PARSED') AND raw_run_no = (SELECT MAX(run_no) FROM html_page_access WHERE html_page_name = p_web_page AND status = 'PARSED') AND status = 'PARSED';
-- dbms_output.put_line('l_parse_run_no = ' || l_parse_run_no); -- dbms_output.put_line('l_raw_run_no = ' || l_raw_run_no);
-- Get the start component sequence for the table of interest. SELECT component_seq INTO l_table_start_comp_seq FROM ( SELECT component_seq ,rownum table_no FROM parsed_html WHERE parse_run_no = l_parse_run_no AND raw_run_no = l_raw_run_no AND UPPER(html_component) = CHR(60) || 'TABLE>' ) WHERE table_no = p_table_no;
-- dbms_output.put_line('l_table_start_comp_seq = ' || -- l_table_start_comp_seq);
-- Get the end component sequence for the table of interest. SELECT component_seq INTO l_table_end_comp_seq FROM ( SELECT component_seq ,rownum table_no FROM parsed_html WHERE parse_run_no = l_parse_run_no AND raw_run_no = l_raw_run_no AND UPPER(html_component) = CHR(60) || '/TABLE>' ) WHERE table_no = p_table_no;
-- dbms_output.put_line('l_table_end_comp_seq = ' || -- l_table_end_comp_seq);
-- For all the HTML components inside the range found above... FOR r_html IN (SELECT html_component FROM parsed_html WHERE parse_run_no = l_parse_run_no AND raw_run_no = l_raw_run_no AND component_seq BETWEEN l_table_start_comp_seq + 1 AND l_table_end_comp_seq - 1 ORDER BY component_seq) LOOP -- dbms_output.put_line('r_html.html_component = ' || -- r_html.html_component); -- dbms_output.put_line('UPPER(r_html.html_component) = ' || -- UPPER(r_html.html_component)); -- dbms_output.put_line('CHR(60) || ''TR>'' = ' || -- CHR(60) || 'TR>'); -- dbms_output.put_line('CHR(60) || ''TD>'' = ' || -- CHR(60) || 'TD>'); -- dbms_output.put_line('CHR(60) || ''/TD>'' = ' || -- CHR(60) || '/TD>');
-- If the start of a new table row... IF UPPER(r_html.html_component) = CHR(60) || 'TR>' THEN -- dbms_output.put_line('The start of a new table row.');
-- Create a new list object. l_data_list.EXTEND; -- Else If an opening or closing Table Data tag... ELSIF UPPER(r_html.html_component) = CHR(60) || 'TD>' OR UPPER(r_html.html_component) = CHR(60) || '/TD>' THEN -- dbms_output.put_line('An opening or closing Table Data tag.');
-- If an opening tag and not the first piece of data... IF UPPER(r_html.html_component) = CHR(60) || 'TD>' AND LENGTH(l_data_list(l_data_list.COUNT)) > 0 THEN -- dbms_output.put_line('An opening tag and not the first ' || -- 'piece of data.');
-- Add a comma component seperator. l_data_list(l_data_list.COUNT) := l_data_list(l_data_list.COUNT) || ','; END IF;
-- Add a quote. l_data_list(l_data_list.COUNT) := l_data_list(l_data_list.COUNT) || ''''; -- Else if not a tag... ELSIF SUBSTR(r_html.html_component, 1, 1) != CHR(60) THEN -- dbms_output.put_line('The component is not a tag (and so is ' || -- 'data).');
-- Add the data to the object. l_data_list(l_data_list.COUNT) := l_data_list(l_data_list.COUNT) || r_html.html_component; -- Else do nothing. ELSE -- dbms_output.put_line('Non-processing component - do nothing.');
NULL; END IF;
-- dbms_output.put_line('l_data_list(' || l_data_list.COUNT || -- ') = [' || l_data_list(l_data_list.COUNT) || ']'); END LOOP;
-- Return the list. RETURN l_data_list;
EXCEPTION WHEN OTHERS THEN RAISE_APPLICATION_ERROR(-20001, 'extract_html_data.extract_table Error: [' || SQLERRM || ']. p_table_no = [' || p_table_no || '], p_web_page = [' || p_web_page || '].'); END extract_table; END extract_html_data; /
SHOW ERRORS
|
0 Comments:
Post a Comment
<< Home