MapReduce实现倒排索引

Alex / 4-12 18:02 / Hadoop / Tag: mapreduce

1.jpg

这里把两个job写在了一起,不推荐这样做;实际部署中建议分别编写,然后通过shell脚本指定先后执行顺序。

 

public class InverseIndexStepOne {
	
	
	
	public static class StepOneMapper extends Mapper<LongWritable, Text, Text, LongWritable>{
		
		@Override
		protected void map(LongWritable key, Text value,Context context)
				throws IOException, InterruptedException {

			//拿到一行数据
			String line = value.toString();
			//切分出各个单词
			String[] fields = StringUtils.split(line, " ");
			
			//获取这一行数据所在的文件切片
			FileSplit inputSplit = (FileSplit) context.getInputSplit();
			//从文件切片中获取文件名
			String fileName = inputSplit.getPath().getName();
			
			for(String field:fields){
				
				//封装kv输出  ,  k :  hello-->a.txt     v:  1
				context.write(new Text(field+"-->"+fileName), new LongWritable(1));
				
			}
			
		}
		
		
	}
	
	
	public static class StepOneReducer extends Reducer<Text, LongWritable, Text, LongWritable>{
		
		// <hello-->a.txt,{1,1,1....}>
		@Override
		protected void reduce(Text key, Iterable<LongWritable> values,Context context)
				throws IOException, InterruptedException {

			long counter = 0;
			for(LongWritable value:values){
				
				counter += value.get();
				
			}
			
			context.write(key, new LongWritable(counter));
		}
		
		
	}
	
	public static void main(String[] args) throws Exception {

		Configuration conf = new Configuration();	
		Job job = Job.getInstance(conf);
		
		job.setJarByClass(InverseIndexStepOne.class);
		
		job.setMapperClass(StepOneMapper.class);
		job.setReducerClass(StepOneReducer.class);
		
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(LongWritable.class);
		
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		
		//检查一下参数所指定的输出路径是否存在,如果已存在,先删除
		Path output = new Path(args[1]);
		FileSystem fs = FileSystem.get(conf);
		if(fs.exists(output)){
			fs.delete(output, true);
		}
		
		FileOutputFormat.setOutputPath(job, output);
		
		System.exit(job.waitForCompletion(true)?0:1);
		
		
	}

}
public class InverseIndexStepTwo {

	
public static class StepTwoMapper extends Mapper<LongWritable, Text, Text, Text>{
		
	
	    //k: 行起始偏移量    v:  {hello-->a.txt   3} 
		@Override
		protected void map(LongWritable key, Text value,Context context)
				throws IOException, InterruptedException {
			
			String line = value.toString();
			
			String[] fields = StringUtils.split(line, "	");
			String[] wordAndfileName = StringUtils.split(fields[0], "-->");
			
			String word = wordAndfileName[0];
			String fileName = wordAndfileName[1];
			long count = Long.parseLong(fields[1]);
			
			
			context.write(new Text(word), new Text(fileName+"-->"+count));		
			//map输出的结果是这个形式   : <hello,a.txt-->3>
			
		}
}


	public static class StepTwoReducer extends Reducer<Text, Text,Text, Text>{
		
		@Override
		protected void reduce(Text key, Iterable<Text> values,Context context)
				throws IOException, InterruptedException {

			//拿到的数据  <hello,{a.txt-->3,b.txt-->2,c.txt-->1}>
			
			String result = "";
			
			for(Text value:values){
				
				result += value + " ";
			}
			
			context.write(key, new Text(result));
			//输出的结果就是  k: hello   v: a.txt-->3  b.txt-->2  c.txt-->1  
			
		}
		
	}

	public static void main(String[] args) throws Exception {

		Configuration conf = new Configuration();	
		
		//先构造job_one
		Job job_one = Job.getInstance(conf);
		
		job_one.setJarByClass(InverseIndexStepTwo.class);
		job_one.setMapperClass(StepOneMapper.class);
		job_one.setReducerClass(StepOneReducer.class);
		//......
		
		
		//构造job_two
		Job job_tow = Job.getInstance(conf);
		
		job_tow.setJarByClass(InverseIndexStepTwo.class);
		
		job_tow.setMapperClass(StepTwoMapper.class);
		job_tow.setReducerClass(StepTwoReducer.class);
		
		job_tow.setOutputKeyClass(Text.class);
		job_tow.setOutputValueClass(Text.class);
		
		FileInputFormat.setInputPaths(job_tow, new Path(args[0]));
		
		//检查一下参数所指定的输出路径是否存在,如果已存在,先删除
		Path output = new Path(args[1]);
		FileSystem fs = FileSystem.get(conf);
		if(fs.exists(output)){
			fs.delete(output, true);
		}
		
		FileOutputFormat.setOutputPath(job_tow, output);
		
		
		//先提交job_one执行
		boolean one_result = job_one.waitForCompletion(true);
		if(one_result){
		System.exit(job_tow.waitForCompletion(true)?0:1);
		}
		
	}

}

 


发表留言:

Hive SQL优化之 Count Distinct | Storm集群安装
返回顶部
Themes by lishiyu.cn