简体   繁体   中英

Python Spark Find Top 5 wiki pages of each type

I have been learning python spark and having a small issue. I have a list of wiki pages their language and their views I am trying to the top 5 pages by views in each language. I have the pages filtered and grouped but having issues using the .top() function. So far I have

# --------------------------------------------------------
#           PYTHON PROGRAM
# Here is where we are going to define our set of...
# - Imports
# - Global Variables
# - Functions
# ...to achieve the functionality required.
# When executing > python 'this_file'.py in a terminal,
# the Python interpreter will load our program,
# but it will execute nothing yet.
# --------------------------------------------------------

import sys
import codecs

def process_line(line):

    # Tokenize words
    tokens = line.split(' ')

    # Check for the arabic case, thus the tokens
    # will be entered in a opposite index
    if tokens[0].isdigit():
        tokens = tokens[::-1]

    # Return dic
    rtn = ((tokens[0], tokens[1]), tokens[2])

    return rtn

# Filter function to check if the record
# is a member of the required langs set
def my_filter_function(key, languages):
    res = False

    for l in languages:
      if key.startswith(l):
        res = True
        break

    return res


# ------------------------------------------
# FUNCTION my_main
# ------------------------------------------
def my_main(dataset_dir, o_file_dir, languages, num_top_entries):

  inputRDD = sc.textFile(dataset_dir)
  allWordsRDD = inputRDD.map(lambda x: process_line(x))
  reducedRDD = allWordsRDD.filter(lambda (x, y): my_filter_function(x[0], languages))\
               .reduceByKey(lambda x, y: x + y)\
               .map(lambda (x, y): (x[0], (x[1], y)))\
               .groupByKey()\
               .map(lambda x : (x[0], list(x[1])))\

  resVAL = reducedRDD.collect()
  for item in resVAL:
    print(item)
  return


# ---------------------------------------------------------------
#           PYTHON EXECUTION
# This is the main entry point to the execution of our program.
# It provides a call to the 'main function' defined in our
# Python program, making the Python interpreter to trigger
# its execution.
# ---------------------------------------------------------------
if __name__ == '__main__':
    dataset_dir = "/FileStore/tables/A01_my_dataset"
    o_file_dir = "/FileStore/tables/A01_my_result/"

    languages = ["en", "es", "fr"]
    num_top_entries = 5

    dbutils.fs.rm(o_file_dir, True)


    my_main(dataset_dir, o_file_dir, languages, num_top_entries)

This works great which gives my by results in groups with the tuples. So an example output it

(u'es.v', [(u'Termoendurecibles_o_termoestables', u'1'), (u'Psicolog\xeda_forense/La_v\xedctima', u'1'), (u'Zoolog\xeda_de_los_artr\xf3podos/Sistema_respiratorio', u'1'), (u'Categor\xeda:Electr\xf3nica', u'1'), (u'L\xf3gica_proposicional/La_implicaci\xf3n', u'1'), (u'Factorizaci\xf3n', u'5'), (u'Rectificaci\xf3n_de_media_onda_no_controlada_con_carga_resistiva-inductiva', u'1'), (u'Laboratorio_de_Tecnolog\xeda_Electr\xf3nica/Medida_del_desfase', u'2'), (u'Discapacidad_auditiva', u'1'), (u'Partes_de_una_guitarra', u'1'), (u'L\xf3gica_proposicional', u'1'), (u'Ingenier\xeda_de_requisitos_software', u'1'), (u'Vidrios_cer\xe1micos', u'1'), (u'Qu\xedmica_org\xe1nica', u'1'), (u'Relatividad_Especial', u'1'), (u'Eficacia_de_la_comunicaci\xf3n', u'1'), (u'Taller_de_f\xedsica/Tercera_ley_de_Newton_del_movimiento', u'2'), (u'Ortograf\xeda/Abreviaciones', u'1'), (u'Sucesiones_num\xe9ricas._Progresiones', u'1'), (u'Estructura_del_computador', u'1'), (u'Planes_de_estudio_de_Matem\xe1tica', u'1'), (u'Transformada_de_Fourier_discreta', u'1'), (u'Psicolog\xeda_forense/La_presentaci\xf3n_de_resultados', u'1'), (u'Qu\xedmica_inorg\xe1nica/Hidr\xf3geno', u'1'), (u'L\xf3gica_proposicional/La_conjunci\xf3n/Evaluaci\xf3n', u'1'), (u'Taxonom\xeda_y_clasificaci\xf3n_de_los_seres_vivos', u'1'), (u'Modelos_at\xf3micos/Modelo_de_B\xf6hr', u'1'), (u'Procesamiento_y_herramientas_de_edici\xf3n_de_video', u'1'), (u'Macroeconom\xeda_intermedia/Oferta_agregada', u'1'), (u'Metodolog\xeda_de_la_Investigaci\xf3n', u'1'), (u'Curso_pr\xe1ctico_para_aprender_a_leer_m\xfasica_(Solfeo)', u'4'), (u'Rectificaci\xf3n_de_media_onda_no_controlada_con_carga_resistiva', u'3'), (u'Evaluaci\xf3n_de_las_fuentes_de_informaci\xf3n_en_la_web', u'2'), (u'Trinomio_cuadrado_perfecto', u'2'), (u'Principios_del_procesado_de_los_pol\xedmeros', u'3'), (u'Microorganismos_perjudiciales_y_beneficiosos', u'2'), (u'Categor\xeda:Qu\xedmica', u'1'), (u'Redes_de_datos', u'1'), (u'El_periodo_de_las_operaciones_concretas_(7-11_a\xf1os)_de_Piaget', u'2'), (u'N\xfameros_naturales/Potenciaci\xf3n', u'1'), (u'Derecho_Penal_General', u'1'), (u'Correspondencia_biun\xedvoca', u'1'), (u'Vocabulario_de_ingl\xe9s_por_fotos/Frutas', u'1'), (u'Fundamentos_de_programaci\xf3n/Variables_y_asignaciones', u'1'), (u'Herramientas_de_edici\xf3n_de_imagen', u'1'), (u'Gallego', u'1'), (u'Equilibrio_de_precipitaci\xf3n', u'2'), (u'Categor\xeda:Procesado_de_se\xf1ales', u'1'), (u'Entrenamiento_deportivo/Principios_fundamentales', u'3'), (u'N\xfameros_naturales/Propiedades_y_relaciones_de_orden', u'1'), (u'Los_cuatro_pilares_de_la_educaci\xf3n', u'4'), (u'Potencias_y_ra\xedces_de_n\xfameros_reales', u'1'), (u'Filosof\xeda_antigua', u'3'), (u'Caracterizaci\xf3n_de_se\xf1ales', u'3'), (u'Poblaci\xf3n_y_muestra', u'1'), (u'Introducci\xf3n_a_la_psicolog\xeda/Procesos_psicol\xf3gicos_b\xe1sicos', u'1'), (u'Sistemas_de_almacenamiento', u'2'), (u'Principales_conjuntos_num\xe9ricos', u'1'), (u'Derecho_Procesal_Penal', u'2'), (u'F\xf3rmula_general_(matem\xe1ticas)', u'2'), (u'Introducci\xf3n_a_la_cinematograf\xeda/Definici\xf3n_e_historia', u'1'), .......

My issue is how can I run the top command on this data. I was trying .top(5, key=lambda x: x[2]) . But did not work. Any help would be great thanks. Oh and If I am going about this all wrong please do tell me.

You can use window functions create a row_number for each partition(in your case each language is a partition).

from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

window = Window.partitionBy(df['language']).orderBy(df['view'].desc())

And then you could do

val n = 5
df.select(col('*'), row_number().over(window).alias('row_number')) \
.where(col('row_number') <= n) 

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM