2013-08-09 3 views
1

Я хочу выполнить команду pig в встроенной программе java. На мгновение я попробую Свинья в локальном режиме. Мой размер файла данных составляет около 15 МБ, но выполнение этой команды очень долго, так что я думаю, что мой сценарий нужно оптимизаций ...Оптимизируйте запрос свиньи

Мой сценарий:

A = LOAD 'data' USING PigPrismeLoader('data.xml'); 
filter_response_time_less_than_1_s = FILTER A BY (response_time < 1000.0); 
filter_response_time_between_1_s_and_2_s = FILTER A BY (response_time >= 1000.0 AND response_time < 1999.0); 
filter_response_time_between_greater_than_2_s = FILTER A BY (response_time >= 2000.0); 
star__zne_asfo_access_log = FOREACH (COGROUP A BY (date_day,url,date_minute,ret_code,serveur), filter_response_time_between_greater_than_2_s BY (date_day,url,date_minute,ret_code,serveur), filter_response_time_less_than_1_s BY (date_day,url,date_minute,ret_code,serveur), filter_response_time_between_1_s_and_2_s BY (date_day,url,date_minute,ret_code,serveur)) 
{ 
     GENERATE 
       FLATTEN(group) AS (date_day,zne_asfo_url,date_minute,zne_http_code,zne_asfo_server), 
       (long)SUM((bag{tuple(long)})A.response_time) AS response_time, 
       COUNT(filter_response_time_less_than_1_s) AS response_time_less_than_1_s, 
       COUNT(filter_response_time_between_1_s_and_2_s) AS response_time_between_1_s_and_2_s, 
       COUNT(filter_response_time_between_greater_than_2_s) AS response_time_between_greater_than_2_s, 
       COUNT(A) AS nb_hit; 
}; 
agg__zne_asfo_access_log_ymd = FOREACH (COGROUP A BY (date_day,date_year,date_month), filter_response_time_between_greater_than_2_s BY (date_day,date_year,date_month), filter_response_time_less_than_1_s BY (date_day,date_year,date_month), filter_response_time_between_1_s_and_2_s BY (date_day,date_year,date_month)) 
{ 
     GENERATE 
       FLATTEN(group) AS (date_day,date_year,date_month), 
       (long)SUM((bag{tuple(long)})A.response_time) AS response_time, 
       COUNT(filter_response_time_less_than_1_s) AS response_time_less_than_1_s, 
       COUNT(filter_response_time_between_1_s_and_2_s) AS response_time_between_1_s_and_2_s, 
       COUNT(filter_response_time_between_greater_than_2_s) AS response_time_between_greater_than_2_s, 
       COUNT(A) AS nb_hit; 
}; 
agg__zne_asfo_access_log_ymd_ret_url = FOREACH (COGROUP A BY (date_day,url,date_year,date_month), filter_response_time_between_greater_than_2_s BY (date_day,url,date_year,date_month), filter_response_time_less_than_1_s BY (date_day,url,date_year,date_month), filter_response_time_between_1_s_and_2_s BY (date_day,url,date_year,date_month)) 
{ 
     GENERATE 
       FLATTEN(group) AS (date_day,zne_asfo_url,date_year,date_month), 
       (long)SUM((bag{tuple(long)})A.response_time) AS response_time, 
       COUNT(filter_response_time_less_than_1_s) AS response_time_less_than_1_s, 
       COUNT(filter_response_time_between_1_s_and_2_s) AS response_time_between_1_s_and_2_s, 
       COUNT(filter_response_time_between_greater_than_2_s) AS response_time_between_greater_than_2_s, 
       COUNT(A) AS nb_hit; 
}; 
agg__zne_asfo_access_log_ymd_ret_code = FOREACH (COGROUP A BY (date_day,ret_code,date_year,date_month), filter_response_time_between_greater_than_2_s BY (date_day,ret_code,date_year,date_month), filter_response_time_less_than_1_s BY (date_day,ret_code,date_year,date_month), filter_response_time_between_1_s_and_2_s BY (date_day,ret_code,date_year,date_month)) 
{ 
     GENERATE 
       FLATTEN(group) AS (date_day,zne_http_code,date_year,date_month), 
       (long)SUM((bag{tuple(long)})A.response_time) AS response_time, 
       COUNT(filter_response_time_less_than_1_s) AS response_time_less_than_1_s, 
       COUNT(filter_response_time_between_1_s_and_2_s) AS response_time_between_1_s_and_2_s, 
       COUNT(filter_response_time_between_greater_than_2_s) AS response_time_between_greater_than_2_s, 
       COUNT(A) AS nb_hit; 
}; 
agg__zne_asfo_access_log_ymd_ret_url_server = FOREACH (COGROUP A BY (date_day,url,date_year,date_month,serveur), filter_response_time_between_greater_than_2_s BY (date_day,url,date_year,date_month,serveur), filter_response_time_less_than_1_s BY (date_day,url,date_year,date_month,serveur), filter_response_time_between_1_s_and_2_s BY (date_day,url,date_year,date_month,serveur)) 
{ 
     GENERATE 
       FLATTEN(group) AS (date_day,zne_asfo_url,date_year,date_month,zne_asfo_server), 
       (long)SUM((bag{tuple(long)})A.response_time) AS response_time, 
       COUNT(filter_response_time_less_than_1_s) AS response_time_less_than_1_s, 
       COUNT(filter_response_time_between_1_s_and_2_s) AS response_time_between_1_s_and_2_s, 
       COUNT(filter_response_time_between_greater_than_2_s) AS response_time_between_greater_than_2_s, 
       COUNT(A) AS nb_hit; 
}; 
agg__zne_asfo_access_log_ymd_ret_code_server = FOREACH (COGROUP A BY (date_day,ret_code,date_year,date_month,serveur), filter_response_time_between_greater_than_2_s BY (date_day,ret_code,date_year,date_month,serveur), filter_response_time_less_than_1_s BY (date_day,ret_code,date_year,date_month,serveur), filter_response_time_between_1_s_and_2_s BY (date_day,ret_code,date_year,date_month,serveur)) 
{ 
     GENERATE 
       FLATTEN(group) AS (date_day,zne_http_code,date_year,date_month,zne_asfo_server), 
       (long)SUM((bag{tuple(long)})A.response_time) AS response_time, 
       COUNT(filter_response_time_less_than_1_s) AS response_time_less_than_1_s, 
       COUNT(filter_response_time_between_1_s_and_2_s) AS response_time_between_1_s_and_2_s, 
       COUNT(filter_response_time_between_greater_than_2_s) AS response_time_between_greater_than_2_s, 
       COUNT(A) AS nb_hit; 
}; 
agg__zne_asfo_access_log_ymdi_server = FOREACH (COGROUP A BY (date_day,date_minute,date_year,date_month,serveur), filter_response_time_between_greater_than_2_s BY (date_day,date_minute,date_year,date_month,serveur), filter_response_time_less_than_1_s BY (date_day,date_minute,date_year,date_month,serveur), filter_response_time_between_1_s_and_2_s BY (date_day,date_minute,date_year,date_month,serveur)) 
{ 
     GENERATE 
       FLATTEN(group) AS (date_day,date_minute,date_year,date_month,zne_asfo_server), 
       (long)SUM((bag{tuple(long)})A.response_time) AS response_time, 
       COUNT(filter_response_time_less_than_1_s) AS response_time_less_than_1_s, 
       COUNT(filter_response_time_between_1_s_and_2_s) AS response_time_between_1_s_and_2_s, 
       COUNT(filter_response_time_between_greater_than_2_s) AS response_time_between_greater_than_2_s, 
       COUNT(A) AS nb_hit; 
}; 
agg__zne_asfo_access_log_ymdhi_url = FOREACH (COGROUP A BY (date_day,url,date_minute,date_year,date_month), filter_response_time_between_greater_than_2_s BY (date_day,url,date_minute,date_year,date_month), filter_response_time_less_than_1_s BY (date_day,url,date_minute,date_year,date_month), filter_response_time_between_1_s_and_2_s BY (date_day,url,date_minute,date_year,date_month)) 
{ 
     GENERATE 
       FLATTEN(group) AS (date_day,zne_asfo_url,date_minute,date_year,date_month), 
       (long)SUM((bag{tuple(long)})A.response_time) AS response_time, 
       COUNT(filter_response_time_less_than_1_s) AS response_time_less_than_1_s, 
       COUNT(filter_response_time_between_1_s_and_2_s) AS response_time_between_1_s_and_2_s, 
       COUNT(filter_response_time_between_greater_than_2_s) AS response_time_between_greater_than_2_s, 
       COUNT(A) AS nb_hit; 
}; 
agg__zne_asfo_access_log_ymdhi = FOREACH (COGROUP A BY (date_day,date_minute,date_year,date_month), filter_response_time_between_greater_than_2_s BY (date_day,date_minute,date_year,date_month), filter_response_time_less_than_1_s BY (date_day,date_minute,date_year,date_month), filter_response_time_between_1_s_and_2_s BY (date_day,date_minute,date_year,date_month)) 
{ 
     GENERATE 
       FLATTEN(group) AS (date_day,date_minute,date_year,date_month), 
       (long)SUM((bag{tuple(long)})A.response_time) AS response_time, 
       COUNT(filter_response_time_less_than_1_s) AS response_time_less_than_1_s, 
       COUNT(filter_response_time_between_1_s_and_2_s) AS response_time_between_1_s_and_2_s, 
       COUNT(filter_response_time_between_greater_than_2_s) AS response_time_between_greater_than_2_s, 
       COUNT(A) AS nb_hit; 
}; 
STORE star__zne_asfo_access_log INTO 'star__zne_asfo_access_log' USING PigStorage('\t', '-schema'); 
STORE agg__zne_asfo_access_log_ymd INTO 'agg__zne_asfo_access_log_ymd' USING PigStorage('\t', '-schema'); 
STORE agg__zne_asfo_access_log_ymd_ret_url INTO 'agg__zne_asfo_access_log_ymd_ret_url' USING PigStorage('\t', '-schema'); 
STORE agg__zne_asfo_access_log_ymd_ret_code INTO 'agg__zne_asfo_access_log_ymd_ret_code' USING PigStorage('\t', '-schema'); 
STORE agg__zne_asfo_access_log_ymd_ret_url_server INTO 'agg__zne_asfo_access_log_ymd_ret_url_server' USING PigStorage('\t', '-schema'); 
STORE agg__zne_asfo_access_log_ymd_ret_code_server INTO 'agg__zne_asfo_access_log_ymd_ret_code_server' USING PigStorage('\t', '-schema'); 
STORE agg__zne_asfo_access_log_ymdi_server INTO 'agg__zne_asfo_access_log_ymdi_server' USING PigStorage('\t', '-schema'); 
STORE agg__zne_asfo_access_log_ymdhi_url INTO 'agg__zne_asfo_access_log_ymdhi_url' USING PigStorage('\t', '-schema'); 
STORE agg__zne_asfo_access_log_ymdhi INTO 'agg__zne_asfo_access_log_ymdhi' USING PigStorage('\t', '-schema'); 

Любые идеи?

+2

Общее замечание о Hadoop- не было сделано для работы на небольших наборах данных, а файл длиной 15 МБ - это крошечное пятнышко данных по сравнению с тем, что он должен был обрабатывать. – so13eit

ответ

1

Возможно, вам понадобится оптимизация вашего сценария, но, как сказано в комментариях, это крошечное пятнышко данных для Hadoop.

Hadoop не подходит для таких небольших данных (даже до гигабайт).

Это потому, что Hadoop, предназначенный для обработки огромного количества данных, включает в себя сложную структуру обработки, которая требует времени для настройки. Если вы рассматриваете большой набор данных, это время настройки незначительно, но если вы работаете с 15 МБ данных, настройка структуры займет гораздо больше времени, чем фактическая обработка этих данных.

+0

Да, я знаю, но небольшой файл может генерировать много вычислений. С этим файлом у меня уже есть ошибка в области кучи (4 ГБ), поэтому я считаю, что мой скрипт нуждается в оптимизации. Кто-то может сказать мне, что плохо в моем сценарии? – 35niavlys

+0

никто не имеет идеи? – 35niavlys

Смежные вопросы