Humble Trader

Tuesday, September 05, 2006

Get HTML - get_html_b.sql

-- ------------------------------------------------------------------------- --
-- --
-- Title: get_html_b.sql --
-- Author: Steve Roach --
-- Date: 24-JUL-2006 --
-- Version: 01.00 --
-- --
-- Description: A package for general control procedures. --
-- --
-- History: --
-- --
-- Date Ver Author Description --
-- --------- ----- ------ -------------------------------------------------- --
-- 25-JUL-06 01.00 SRR First release --
-- --
-- ------------------------------------------------------------------------- --

-- Package body definition.
CREATE OR REPLACE PACKAGE BODY get_html
AS
-- Create types:
TYPE TypeTagStack IS TABLE OF parsed_html.html_component%TYPE;

-- fetch_html Overload 1
-- Wrapper to do fetch with housekeeping.
PROCEDURE fetch_html
(
p_web_page IN VARCHAR2
)
IS
l_run_no NUMBER;
l_status NUMBER;

-- Set up mapping and page access rows for this run.
PROCEDURE initialise_raw_html
(
p_web_page IN VARCHAR2
,p_run_no OUT NUMBER
)
IS
BEGIN
ctl_gen.initialise_mapping('M_H2S_RAW_HTML', p_run_no);

ctl_get_html.init_html_page_access(p_web_page, p_run_no);

EXCEPTION
WHEN OTHERS THEN
RAISE_APPLICATION_ERROR(-20009,
'gen_html.fetch_html(1)-initialise_raw_html Error: [' ||
SQLERRM || ']. p_web_page = [' || p_web_page ||
'], p_run_no = [' || p_run_no || '].');
END initialise_raw_html;

-- Close page access and mapping for this run.
PROCEDURE finalise_raw_html
(
p_run_no IN NUMBER
)
IS
BEGIN
ctl_get_html.html_page_access_update_status(p_run_no, 'FETCHED');

ctl_gen.finalise_mapping(p_run_no);

EXCEPTION
WHEN OTHERS THEN
RAISE_APPLICATION_ERROR(-20010,
'gen_html.fetch_html(1)-finalise_raw_html Error: [' ||
SQLERRM || ']. p_run_no = [' || p_run_no || '].');
END finalise_raw_html;

BEGIN
initialise_raw_html(p_web_page, l_run_no);

fetch_html(p_web_page, l_run_no);

finalise_raw_html(l_run_no);

COMMIT;

EXCEPTION
WHEN OTHERS THEN
RAISE_APPLICATION_ERROR(-20011,
'gen_html.fetch_html(1) Error: [' || SQLERRM ||
']. p_web_page = [' || p_web_page || '].');
END fetch_html;

-- fetch_html Overload 2
-- Fetch web page.
PROCEDURE fetch_html
(
p_web_page IN VARCHAR2
,p_run_no IN NUMBER
)
IS
l_url VARCHAR2(4000);
l_pieces_tab UTL_HTTP.HTML_PIECES;
l_piece_seq NUMBER := 0;
BEGIN
SELECT url
INTO l_url
FROM html_pages
WHERE name = p_web_page;

l_pieces_tab := UTL_HTTP.REQUEST_PIECES(l_url);

FOR l_i IN 1..l_pieces_tab.count
LOOP
l_piece_seq := l_piece_seq + 1;

INSERT INTO raw_html
(
run_no
,piece_seq
,html_piece
,ins_tsp
)
VALUES
(
p_run_no
,l_piece_seq
,l_pieces_tab(l_i)
,SYSDATE
);
END LOOP;

EXCEPTION
WHEN OTHERS THEN
RAISE_APPLICATION_ERROR(-20012,
'gen_html.fetch_html(2): [' || SQLERRM || ']. p_web_page = [' ||
p_web_page || '], p_run_no = [' || p_run_no || '].');
END fetch_html;

-- parse_html Overload 1
-- Wrapper to do parse with housekeeping.
PROCEDURE parse_html
IS
l_run_no NUMBER;
l_status NUMBER;

-- Set up mapping for this run.
PROCEDURE initialise_parse_html
(
p_run_no OUT NUMBER
)
IS
BEGIN
ctl_gen.initialise_mapping('M_S2S_PARSE_HTML', p_run_no);

EXCEPTION
WHEN OTHERS THEN
RAISE_APPLICATION_ERROR(-20013,
'gen_html.parse_html(1)-initialise_parse_html: [' || SQLERRM ||
'].');
END initialise_parse_html;

-- Close mapping for this run.
PROCEDURE finalise_parse_html
(
p_run_no IN NUMBER
)
IS
l_status NUMBER;
BEGIN
ctl_gen.finalise_mapping(p_run_no);

EXCEPTION
WHEN OTHERS THEN
RAISE_APPLICATION_ERROR(-20014,
'gen_html.fetch_html(1)-finalise_raw_html Error: [' ||
SQLERRM || ']. p_run_no = [' || p_run_no || '].');
END finalise_parse_html;

BEGIN
initialise_parse_html(l_run_no);

parse_html(l_run_no);

finalise_parse_html(l_run_no);

EXCEPTION
WHEN OTHERS THEN
RAISE_APPLICATION_ERROR(-20015,
'gen_html.parse_html(1): Error: [' || SQLERRM || '].');
END parse_html;

-- parse_html Overload 2: OWB
-- Parse all page accesses with status of 'FETCHED'.
PROCEDURE parse_html
(
p_parse_run_no IN NUMBER
)
IS
BEGIN
FOR r_html_page_access IN
(SELECT run_no raw_run_no
FROM html_page_access
WHERE status = 'FETCHED'
ORDER BY run_no
FOR UPDATE OF status)
LOOP
ctl_get_html.init_html_parse_passes(
p_parse_run_no, r_html_page_access.raw_run_no);

parse_html(p_parse_run_no, r_html_page_access.raw_run_no);

ctl_get_html.html_parse_passes_update_stat(p_parse_run_no,
r_html_page_access.raw_run_no, 'PARSED');

ctl_get_html.html_page_access_update_status(
r_html_page_access.raw_run_no, 'PARSED');

END LOOP;

EXCEPTION
WHEN OTHERS THEN
RAISE_APPLICATION_ERROR(-20016,
'gen_html.parse_html(2): [' || SQLERRM || ']. p_parse_run_no = ['
|| p_parse_run_no || '].');
END;

-- parse_html Overload 3
-- Parse web page.
PROCEDURE parse_html
(
p_parse_run_no IN NUMBER
,p_raw_run_no IN NUMBER
)
IS
l_indent NUMBER := 0;
l_piece_seq raw_html.piece_seq%TYPE := 0;
l_state NUMBER;
l_char VARCHAR2(1);
l_piece_num NUMBER := 0;
l_tag_buf VARCHAR2(4000);
l_dat_buf VARCHAR2(4000);

t_tag_stack TypeTagStack := TypeTagStack('ROOT');

s_comp_reqd NUMBER := 1; -- Looking for HTML object.
s_in_tag NUMBER := 2; -- In a tag.
s_got_tag NUMBER := 3; -- Have got a tag name.
s_in_dat NUMBER := 4; -- In a data block.

e_closing_tag_not_matched EXCEPTION;
e_tag_opener_in_tag EXCEPTION;
e_tag_opener_in_dat EXCEPTION;
e_no_such_tag EXCEPTION;
e_no_html EXCEPTION;

-- Push a tag on to the tag stack.
PROCEDURE push_tag(p_tag_name IN VARCHAR2)
IS
BEGIN
t_tag_stack.EXTEND;

t_tag_stack(t_tag_stack.COUNT) := p_tag_name;

EXCEPTION
WHEN OTHERS THEN
RAISE_APPLICATION_ERROR(-20017,
'gen_html.parse_html(3)-push_tag: [' || SQLERRM ||
']. p_tag_name = [' || p_tag_name || '].');
END push_tag;

-- Pop a tag from the tag stack.
PROCEDURE pop_tag(p_tag_name IN VARCHAR2)
IS
BEGIN
IF t_tag_stack(t_tag_stack.COUNT) != p_tag_name
THEN
RAISE e_closing_tag_not_matched;
END IF;

t_tag_stack.TRIM;

EXCEPTION
WHEN e_closing_tag_not_matched THEN
RAISE_APPLICATION_ERROR(-20025,
'gen_html.parse_html(3)-pop_tag: Error: An opening tag ' ||
'(t_tag_stack(t_tag_stack.COUNT) = [' ||
t_tag_stack(t_tag_stack.COUNT) || ']) is not matched ' ||
'by the correct closing tag (p_tag_name = [' || p_tag_name ||
']).');
WHEN OTHERS THEN
RAISE_APPLICATION_ERROR(-20018,
'gen_html.parse_html(3)-pop_tag: [' || SQLERRM ||
']. p_tag_name = [' || p_tag_name || '].');
END pop_tag;

-- Process a complete tag.
PROCEDURE process_tag(p_tag IN VARCHAR2)
IS
l_tag VARCHAR2(4000);
l_tag_type NUMBER;
l_tag_name VARCHAR2(2000);
l_treatment html_tags.treatment%TYPE;
l_paired html_tags.paired%TYPE;

c_opening_tag NUMBER := 1;
c_closing_tag NUMBER := 2;
BEGIN
l_tag := UPPER(p_tag);

IF SUBSTR(l_tag, 2, 1) != '/'
THEN
l_tag_type := c_opening_tag;
l_tag_name := SUBSTR(l_tag, 2, LENGTH(l_tag) - 2);

IF SUBSTR(l_tag_name, 1, 2) = 'A '
THEN
l_tag_name := 'A';
END IF;
ELSE
l_tag_type := c_closing_tag;
l_tag_name := SUBSTR(l_tag, 3, LENGTH(l_tag) - 3);
END IF;

BEGIN
SELECT treatment,
paired
INTO l_treatment,
l_paired
FROM html_tags
WHERE name = l_tag_name;
EXCEPTION
WHEN no_data_found
THEN
RAISE e_no_such_tag;
END;

IF l_paired = 'Y'
THEN
IF l_tag_type = c_opening_tag
THEN
push_tag(l_tag_name);
ELSE -- Closing tag.
pop_tag(l_tag_name);
END IF;
END IF;

IF l_treatment = 'KEEP'
THEN
IF l_paired = 'Y'
THEN
IF l_tag_type = c_opening_tag
THEN
l_indent := l_indent + 1;
ELSE
l_indent := l_indent - 1;
END IF;
END IF;

l_piece_seq := l_piece_seq + 1;

INSERT INTO parsed_html
(
parse_run_no
,raw_run_no
,component_seq
,indent
,html_component
,ins_tsp
)
VALUES
(
p_parse_run_no
,p_raw_run_no
,l_piece_seq
,l_indent
,p_tag
,SYSDATE
);
END IF;

EXCEPTION
WHEN e_no_such_tag THEN
RAISE_APPLICATION_ERROR(-20019,
'gen_html.parse_html(3)-process_tag: Error: No such tag: [' ||
p_tag || '].');
WHEN OTHERS THEN
RAISE_APPLICATION_ERROR(-20020,
'gen_html.parse_html(3)-process_tag: Error: [' || SQLERRM ||
']. p_tag = [' || p_tag || '].');
END process_tag;

-- Process a piece of data.
PROCEDURE process_dat(p_data IN VARCHAR2)
IS
l_data VARCHAR2(4000);
BEGIN
-- Strip leading and trailing spaces from data and then put a space
-- on the end.
l_data := RTRIM(LTRIM(p_data)) || ' ';

IF l_data != ' '
AND
l_data != '&' || 'nbsp; '
THEN
l_piece_seq := l_piece_seq + 1;

INSERT INTO parsed_html
(
parse_run_no
,raw_run_no
,component_seq
,indent
,html_component
,ins_tsp
)
VALUES
(
p_parse_run_no
,p_raw_run_no
,l_piece_seq
,l_indent
,l_data
,SYSDATE
);
END IF;

EXCEPTION
WHEN OTHERS THEN
RAISE_APPLICATION_ERROR(-20021,
'gen_html.parse_html(3)-process_dat: Error: [' || SQLERRM ||
']. p_data = [' || p_data || '].');
END process_dat;

BEGIN
l_state := s_comp_reqd;

FOR r_html_piece IN
(SELECT html_piece
FROM raw_html
WHERE run_no = p_raw_run_no
ORDER BY piece_seq)
LOOP
l_piece_num := l_piece_num + 1;

FOR l_piece_ptr IN 1..LENGTH(r_html_piece.html_piece)
LOOP
l_char := SUBSTR(r_html_piece.html_piece, l_piece_ptr, 1);

IF ASCII(l_char) IN (10)
THEN
l_char := ' ';
END IF;

IF l_state = s_comp_reqd -- (1)
THEN
IF l_char = '<'
THEN
l_tag_buf := l_char;
l_state := s_in_tag;
ELSE
l_dat_buf := l_char;
l_state := s_in_dat;
END IF;
ELSIF l_state = s_in_tag -- (2)
THEN
IF l_char = '<'
THEN
RAISE e_tag_opener_in_tag;
ELSIF l_char = ' '
THEN
l_state := s_got_tag;
ELSIF l_char = '>'
THEN
l_tag_buf := l_tag_buf || l_char;
process_tag(l_tag_buf);
l_state := s_comp_reqd;
ELSE
l_tag_buf := l_tag_buf || l_char;
END IF;
ELSIF l_state = s_got_tag -- (3)
THEN
IF l_char = '>'
THEN
l_tag_buf := l_tag_buf || l_char;
process_tag(l_tag_buf);
l_state := s_comp_reqd;
END IF;

IF ASCII(SUBSTR(l_tag_buf, 1, 1)) = 60 -- Tag opener.
AND
UPPER(SUBSTR(l_tag_buf, 2, 1)) = 'A'
THEN
l_tag_buf := l_tag_buf || ' ';
END IF;

IF ASCII(SUBSTR(l_tag_buf, 1, 1)) = 60 -- Tag opener.
AND
UPPER(SUBSTR(l_tag_buf, 2, 2)) = 'A '
THEN
l_tag_buf := l_tag_buf || l_char;
END IF;
ELSIF l_state = s_in_dat -- (4)
THEN
IF l_char = '>'
THEN
RAISE e_tag_opener_in_dat;
ELSIF l_char = '<'
THEN
process_dat(l_dat_buf);
l_tag_buf := l_char;
l_state := s_in_tag;
ELSE
l_dat_buf := l_dat_buf || l_char;
END IF;
END IF;
END LOOP;
END LOOP;

IF l_piece_num = 0
THEN
RAISE e_no_html;
END IF;

EXCEPTION
WHEN e_no_html THEN
RAISE_APPLICATION_ERROR(-20022,
'gen_html.parse_html(3): Error: There is no HTML to parse. ' ||
'p_parse_run_no = [' || p_parse_run_no || '], p_raw_run_no = ['
|| p_raw_run_no || '].');
WHEN e_tag_opener_in_dat THEN
RAISE_APPLICATION_ERROR(-20023,
'gen_html.parse_html(3): Error: There is a tag opener (' ||
CHR(60) || ') in the data. Data buffer = [' || l_dat_buf ||
'].');
WHEN e_tag_opener_in_tag THEN
RAISE_APPLICATION_ERROR(-20024,
'gen_html.parse_html(3): Error: There is a tag opener (' ||
CHR(60) || ') in the tag. Tag buffer = [' || l_tag_buf || '].');
WHEN others THEN
RAISE_APPLICATION_ERROR(-20026,
'gen_html.parse_html(3): Error: [' || SQLERRM ||
']. p_parse_run_no = [' || p_parse_run_no ||
'], p_raw_run_no = [' || p_raw_run_no || '].');
END parse_html;

FUNCTION get_scec
(
p_scec IN
html_special_entity_codes.special_entity_code%TYPE
)
RETURN VARCHAR2
IS
l_char VARCHAR2(1);
BEGIN
SELECT character
INTO l_char
FROM html_special_entity_codes
WHERE special_entity_code = UPPER(p_scec);

RETURN l_char;
EXCEPTION
WHEN no_data_found THEN
RAISE_APPLICATION_ERROR(-20030,
'gen_html.get_scec: Error: Can''t match character to ' ||
'[' || p_scec || '].');
WHEN others THEN
RAISE_APPLICATION_ERROR(-20031,
'gen_html.get_scec: Error: [' || SQLERRM ||
']. p_scec = [' || p_scec || '].');
END get_scec;

FUNCTION replace_scec
(
p_string IN VARCHAR2
)
RETURN VARCHAR2
IS
l_in_string VARCHAR2(4000);
l_scec_start NUMBER;
-- Initialise out-string to null.
l_out_string VARCHAR2(4000) := '';
BEGIN
-- Capture in-string locally.
l_in_string := p_string;

-- Loop forever...
LOOP
-- dbms_output.put_line('l_in_string = ' || l_in_string);
-- dbms_output.put_line('l_out_string = ' || l_out_string);

-- Get the position of the first '&' character.
l_scec_start := INSTR(l_in_string, '&');

-- If there are no more scec to process, exit loop.
EXIT WHEN l_scec_start = 0;

-- Concatenate all in-string characters before the '&' to
-- out-string.
l_out_string := l_out_string ||
SUBSTR(l_in_string, 1, l_scec_start - 1);

-- Strip those characters assigned to out-string from in-string
-- plus the '&'.
l_in_string := SUBSTR(l_in_string, l_scec_start + 1);

-- Concatenate the dereferenced scec to the out-string.
l_out_string := l_out_string ||
get_scec(SUBSTR(l_in_string, 1, INSTR(l_in_string, ';') - 1));

-- Strip the remainder of the scec from the in-string.
l_in_string := SUBSTR(l_in_string, INSTR(l_in_string, ';') + 1);
END LOOP;

-- Concatenate any remaining in-string characters to the out-string.
l_out_string := l_out_string || l_in_string;

-- Return the out-string.
RETURN l_out_string;
EXCEPTION
WHEN others THEN
RAISE_APPLICATION_ERROR(-20032,
'gen_html.replace_scec: Error: [' || SQLERRM ||
']. p_string = [' || p_string || '].');
END;
END get_html;
/

SHOW ERRORS

0 Comments:

Post a Comment

<< Home