Humble Trader

Saturday, September 09, 2006

Extract HTML Data - extract_html_data_b.sql

-- ------------------------------------------------------------------------- --
-- --
-- Title: extract_html_data_b.sql --
-- Author: Steve Roach --
-- Date: 09-SEP-2006 --
-- Version: 01.00 --
-- --
-- Description: Container for all procedures and functions relating --
-- to extracting data from semi-structured HTML. --
-- --
-- History: --
-- --
-- Date Ver Author Description --
-- --------- ----- ------ -------------------------------------------------- --
-- 09-SEP-06 01.00 SRR First release --
-- --
-- ------------------------------------------------------------------------- --

-- Package header definition.
CREATE OR REPLACE PACKAGE BODY extract_html_data
AS
FUNCTION extract_table
(
p_table_no IN NUMBER
,p_web_page IN html_pages.name%TYPE
)
RETURN data_list
IS
l_parse_run_no parsed_html.parse_run_no%TYPE;
l_raw_run_no parsed_html.raw_run_no%TYPE;
l_table_start_comp_seq parsed_html.component_seq%TYPE;
l_table_end_comp_seq parsed_html.component_seq%TYPE;
l_table_rows NUMBER;
l_table_columns NUMBER;

l_data_list data_list := data_list();
BEGIN
-- Get run_no keys (parse and raw) for parsed_html from the web page
-- name.
SELECT parse_run_no
,raw_run_no
INTO l_parse_run_no,
l_raw_run_no
FROM html_parse_passes
WHERE parse_run_no =
(SELECT MAX(parse_run_no)
FROM html_parse_passes
WHERE raw_run_no =
(SELECT MAX(run_no)
FROM html_page_access
WHERE html_page_name = p_web_page
AND status = 'PARSED')
AND status = 'PARSED')
AND raw_run_no =
(SELECT MAX(run_no)
FROM html_page_access
WHERE html_page_name = p_web_page
AND status = 'PARSED')
AND status = 'PARSED';

-- dbms_output.put_line('l_parse_run_no = ' || l_parse_run_no);
-- dbms_output.put_line('l_raw_run_no = ' || l_raw_run_no);

-- Get the start component sequence for the table of interest.
SELECT component_seq
INTO l_table_start_comp_seq
FROM
(
SELECT component_seq
,rownum table_no
FROM parsed_html
WHERE parse_run_no = l_parse_run_no
AND raw_run_no = l_raw_run_no
AND UPPER(html_component) = CHR(60) || 'TABLE>'
)
WHERE table_no = p_table_no;

-- dbms_output.put_line('l_table_start_comp_seq = ' ||
-- l_table_start_comp_seq);

-- Get the end component sequence for the table of interest.
SELECT component_seq
INTO l_table_end_comp_seq
FROM
(
SELECT component_seq
,rownum table_no
FROM parsed_html
WHERE parse_run_no = l_parse_run_no
AND raw_run_no = l_raw_run_no
AND UPPER(html_component) = CHR(60) || '/TABLE>'
)
WHERE table_no = p_table_no;

-- dbms_output.put_line('l_table_end_comp_seq = ' ||
-- l_table_end_comp_seq);

-- For all the HTML components inside the range found above...
FOR r_html IN
(SELECT html_component
FROM parsed_html
WHERE parse_run_no = l_parse_run_no
AND raw_run_no = l_raw_run_no
AND component_seq BETWEEN l_table_start_comp_seq + 1
AND l_table_end_comp_seq - 1
ORDER BY component_seq)
LOOP
-- dbms_output.put_line('r_html.html_component = ' ||
-- r_html.html_component);
-- dbms_output.put_line('UPPER(r_html.html_component) = ' ||
-- UPPER(r_html.html_component));
-- dbms_output.put_line('CHR(60) || ''TR>'' = ' ||
-- CHR(60) || 'TR>');
-- dbms_output.put_line('CHR(60) || ''TD>'' = ' ||
-- CHR(60) || 'TD>');
-- dbms_output.put_line('CHR(60) || ''/TD>'' = ' ||
-- CHR(60) || '/TD>');

-- If the start of a new table row...
IF UPPER(r_html.html_component) = CHR(60) || 'TR>'
THEN
-- dbms_output.put_line('The start of a new table row.');

-- Create a new list object.
l_data_list.EXTEND;
-- Else If an opening or closing Table Data tag...
ELSIF UPPER(r_html.html_component) = CHR(60) || 'TD>'
OR
UPPER(r_html.html_component) = CHR(60) || '/TD>'
THEN
-- dbms_output.put_line('An opening or closing Table Data tag.');

-- If an opening tag and not the first piece of data...
IF UPPER(r_html.html_component) = CHR(60) || 'TD>'
AND
LENGTH(l_data_list(l_data_list.COUNT)) > 0
THEN
-- dbms_output.put_line('An opening tag and not the first ' ||
-- 'piece of data.');

-- Add a comma component seperator.
l_data_list(l_data_list.COUNT) :=
l_data_list(l_data_list.COUNT) || ',';
END IF;

-- Add a quote.
l_data_list(l_data_list.COUNT) :=
l_data_list(l_data_list.COUNT) || '''';
-- Else if not a tag...
ELSIF SUBSTR(r_html.html_component, 1, 1) != CHR(60)
THEN
-- dbms_output.put_line('The component is not a tag (and so is ' ||
-- 'data).');

-- Add the data to the object.
l_data_list(l_data_list.COUNT) :=
l_data_list(l_data_list.COUNT) || r_html.html_component;
-- Else do nothing.
ELSE
-- dbms_output.put_line('Non-processing component - do nothing.');

NULL;
END IF;

-- dbms_output.put_line('l_data_list(' || l_data_list.COUNT ||
-- ') = [' || l_data_list(l_data_list.COUNT) || ']');
END LOOP;

-- Return the list.
RETURN l_data_list;

EXCEPTION
WHEN OTHERS THEN
RAISE_APPLICATION_ERROR(-20001,
'extract_html_data.extract_table Error: [' || SQLERRM ||
']. p_table_no = [' || p_table_no || '], p_web_page = [' ||
p_web_page || '].');
END extract_table;
END extract_html_data;
/

SHOW ERRORS

0 Comments:

Post a Comment

<< Home