- Encontrar o vocabulário comum de 1.500 palavras entre 2 livros;
- Encontrar o vocabulário de palavras diferentes de cada livro entre 2 livros, removendo as palavras que forem encontradas nos dois livros;
Rodar o comando: java -jar HtmlToText.jar path_livro
$ java -jar HtmlToText.jar ./HtmlToText/lit2go.ok
Rodar o comando: java -jar SrtToText.jar path_legendas_series
$ java -jar SrtToText.jar ./SrtToText/series
$ hadoop fs -mkdir lit2go.ok # create the directory for the books
$ hadoop fs -mkdir series # create the directory for the series
$ hadoop fs -put ./result_books/ ./lit2go.ok/ # copy the books to Hadoop
$ hadoop fs -put ./result_series/ ./series/ # copy the series to Hadoop
Entrar no Hive e executar:
-- Session settings that allow a table LOCATION to contain nested directories.
-- NOTE(review): presumably required because the uploaded book folders hold
-- their text files in subdirectories - confirm against the HDFS layout.
SET hive.mapred.supports.subdirectories=TRUE;
-- Ask the MapReduce input format to scan input directories recursively.
SET mapred.input.dir.recursive=TRUE;
-- Raw text of A Little Princess, one row per line of the files under LOCATION.
-- The table is EXTERNAL, so Hive only references the uploaded files.
-- IF NOT EXISTS keeps the script re-runnable.
-- The field separator is the Hive default control character (octal 001)
-- instead of a tab. With a single STRING column, a tab separator would
-- silently drop everything after the first tab on a line, and those words
-- would be missing from the word counts.
CREATE EXTERNAL TABLE IF NOT EXISTS a_little_princess
(text STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\001'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/user/root/lit2go.ok/result_books/A_Little_Princess/';
Trocar '/user/root/' pelo caminho do diretório no Hadoop onde você criou a pasta lit2go.ok (que contém result_books).
-- Sanity check: preview five rows of the raw book text.
SELECT text
FROM a_little_princess
LIMIT 5;
-- Raw text of Dracula, one row per line of the files under LOCATION.
-- The table is EXTERNAL, so Hive only references the uploaded files.
-- IF NOT EXISTS keeps the script re-runnable.
-- The field separator is the Hive default control character (octal 001)
-- instead of a tab. With a single STRING column, a tab separator would
-- silently drop everything after the first tab on a line, and those words
-- would be missing from the word counts.
CREATE EXTERNAL TABLE IF NOT EXISTS dracula
(text STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\001'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/user/root/lit2go.ok/result_books/Dracula/';
-- Sanity check: preview five rows of the raw book text.
SELECT text
FROM dracula
LIMIT 5;
-- Word frequencies for A Little Princess.
-- Each line is lower-cased and split on runs of non-word characters, then the
-- resulting tokens are grouped and counted.
-- NOTE(review): split can also emit an empty-string token, which is counted
-- like any other value here.
CREATE TABLE a_little_princess_word_count AS
SELECT
    word,
    COUNT(*) AS wordcount
FROM a_little_princess
LATERAL VIEW explode(split(lower(text), '\\W+')) tokens AS word
GROUP BY word
ORDER BY word;
-- Sanity check: preview five rows of the word-count table.
SELECT
    word,
    wordcount
FROM a_little_princess_word_count
LIMIT 5;
-- Word frequencies for Dracula.
-- Each line is lower-cased and split on runs of non-word characters, then the
-- resulting tokens are grouped and counted.
-- NOTE(review): split can also emit an empty-string token, which is counted
-- like any other value here.
CREATE TABLE dracula_word_count AS
SELECT
    word,
    COUNT(*) AS wordcount
FROM dracula
LATERAL VIEW explode(split(lower(text), '\\W+')) tokens AS word
GROUP BY word
ORDER BY word;
-- Sanity check: preview five rows of the word-count table.
SELECT
    word,
    wordcount
FROM dracula_word_count
LIMIT 5;
-- Common vocabulary (intersection): the 1500 words with the highest combined
-- count among the words that occur in both books.
-- The empty token is excluded. The join already forces both trimmed words to
-- be equal, so checking one side is enough.
-- Ties on total are broken alphabetically so that the 1500-row cut-off returns
-- the same rows on every run.
SELECT
    a.word AS word,
    a.wordcount AS princess_word_count,
    d.wordcount AS dracula_word_count,
    (a.wordcount + d.wordcount) AS total
FROM a_little_princess_word_count a
INNER JOIN dracula_word_count d
    ON (trim(a.word) = trim(d.word))
WHERE trim(a.word) <> ''
ORDER BY total DESC, word ASC
LIMIT 1500;
Resultado: interseccao.csv
2. Encontrar o vocabulário de palavras diferentes de cada livro entre 2 livros, removendo as palavras que forem encontradas nos dois livros (disjunção)
-- Disjunction: words that occur in only one of the two books.
-- The FULL OUTER JOIN keeps unmatched rows from both sides. A row is unmatched
-- when the columns coming from the other table are NULL.
-- Words found only in A Little Princess fill the first two columns, words
-- found only in Dracula fill the last two.
-- The empty token that split can produce is not a word, so it is dropped even
-- when only one of the word-count tables contains it.
SELECT
    a.word,
    a.wordcount,
    d.word,
    d.wordcount
FROM a_little_princess_word_count a
FULL OUTER JOIN dracula_word_count d
    ON (trim(a.word) = trim(d.word))
WHERE (a.word IS NULL OR d.word IS NULL)
  AND trim(COALESCE(a.word, d.word)) <> '';
Resultado: disjuncao.csv