当前位置:首页 > 开发 > 系统架构 > 架构 > 正文

hadoop join实现

发表于: 2012-12-31   作者:blackproof   来源:转载   浏览次数:
摘要: hadoop的join实现,实现复合关键字,多对多连接   key:   public class MultiKey implements WritableComparable<MultiKey> { private Text departId = new Text(); private Text departNo = new Text();

hadoop的join实现,实现复合关键字(组合键),多对多连接

 

key:

 

/**
 * Composite map-output key for the reduce-side join: (departId, departNo).
 * Records with equal composite keys are grouped into the same reduce() call.
 */
public class MultiKey implements WritableComparable<MultiKey> {

	private Text departId = new Text();
	private Text departNo = new Text();

	public Text getDepartId() {
		return departId;
	}

	public void setDepartId(String departId) {
		this.departId = new Text(departId);
	}

	public Text getDepartNo() {
		return departNo;
	}

	public void setDepartNo(String departNo) {
		this.departNo = new Text(departNo);
	}

	/** Serializes both fields in declaration order; readFields must mirror this. */
	@Override
	public void write(DataOutput out) throws IOException {
		departId.write(out);
		departNo.write(out);
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		this.departId.readFields(in);
		this.departNo.readFields(in);
	}

	/** Orders by departId first, then departNo (computed once, not twice). */
	@Override
	public int compareTo(MultiKey o) {
		int cmp = this.departId.compareTo(o.departId);
		return (cmp != 0) ? cmp : this.departNo.compareTo(o.departNo);
	}

	@Override
	public String toString(){
		return this.departId.toString()+" : "+this.departNo.toString();
	}

	/** Consistent with compareTo: equal iff both fields are equal. */
	@Override
	public boolean equals(Object obj) {
		if (this == obj) {
			return true;
		}
		if (!(obj instanceof MultiKey)) {
			return false;
		}
		MultiKey other = (MultiKey) obj;
		return departId.equals(other.departId) && departNo.equals(other.departNo);
	}

	/**
	 * BUG FIX: the original returned 0, which made the default HashPartitioner
	 * send every key to a single reducer, serializing the whole job.
	 * Combine both fields so keys spread across partitions.
	 */
	@Override
	public int hashCode(){
		return departId.hashCode() * 31 + departNo.hashCode();
	}
}
 

 

value:

 

/**
 * Map-output value carrying either an employee record (flag == 1) or a
 * department record (flag == 2); the reducer joins the two sides.
 */
public class Employee implements WritableComparable<Employee> {

	private String empName="";
	private String departId="";
	private String departNo="";
	private String departName="";
	// 1 = employee-side record, 2 = department-side record
	private int flag;

	public int getFlag() {
		return flag;
	}

	public void setFlag(int flag) {
		this.flag = flag;
	}

	public String getEmpName() {
		return empName;
	}

	public void setEmpName(String empName) {
		this.empName = empName;
	}

	public String getDepartId() {
		return departId;
	}

	public void setDepartId(String departId) {
		this.departId = departId;
	}

	public String getDepartNo() {
		return departNo;
	}

	public void setDepartNo(String departNo) {
		this.departNo = departNo;
	}

	public String getDepartName() {
		return departName;
	}

	public void setDepartName(String departName) {
		this.departName = departName;
	}

	/** Field order here must stay in sync with readFields. */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(this.empName);
		out.writeUTF(this.departId);
		out.writeUTF(this.departNo);
		out.writeUTF(this.departName);
		out.writeInt(this.flag);
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		this.empName = in.readUTF();
		this.departId = in.readUTF();
		this.departNo = in.readUTF();
		this.departName = in.readUTF();
		this.flag = in.readInt();
	}

	/**
	 * Debug helper: prints every declared field of {@code obj} to stdout.
	 * BUG FIX: without setAccessible(true), Field.get throws
	 * IllegalAccessException for private fields of any class other than
	 * Employee itself.
	 *
	 * @param out  unused by the current implementation (kept for interface
	 *             compatibility with existing callers)
	 * @param type class whose declared fields are dumped
	 * @param obj  instance to read the field values from
	 */
	public static void writeAllProperties(DataOutput out,Class<? extends WritableComparable<?>> type,Object obj) throws IllegalArgumentException, IllegalAccessException{
		Field[] fields =  type.getDeclaredFields();
		for (Field field : fields) {
			field.setAccessible(true);
			System.out.println(field.get(obj));
		}
	}

	/**
	 * Employee is used only as a MapReduce value; ordering is never consulted,
	 * so all instances compare equal. NOTE(review): if this type is ever used
	 * as a key, a real ordering must be implemented.
	 */
	@Override
	public int compareTo(Employee o) {
		return 0;
	}

	@Override
	public String toString(){
		return this.empName+"  "+this.departName;
	}

}
 

 

 

mapper:

 

 

/**
 * Parses CSV input lines of the form "flag,name,departId,departNo" and emits
 * (MultiKey, Employee) pairs tagged with the record's side of the join
 * (flag 1 = employee name record, anything else = department record).
 */
public class MyJoinMapper extends Mapper<LongWritable, Text, MultiKey, Employee>{

	/** Minimum number of comma-separated fields a valid record carries. */
	private static final int EXPECTED_FIELDS = 4;

	@Override
	public void map(LongWritable key,Text value,Context context) throws IOException,InterruptedException{
		String line = value.toString();
		String[] array = line.split(",");
		// ROBUSTNESS FIX: skip short/malformed lines instead of letting
		// array[2]/array[3] throw ArrayIndexOutOfBoundsException and fail
		// the whole task attempt.
		if (array.length < EXPECTED_FIELDS) {
			return;
		}
		visit(array,context);
	}

	/** Builds the composite key and tagged value from one parsed record. */
	private void visit(String[] array,Context context) throws IOException,InterruptedException{
		int i;
		try {
			i = Integer.parseInt(array[0]);
		} catch (NumberFormatException e) {
			// ROBUSTNESS FIX: non-numeric flag column — skip the record
			// rather than crashing the task.
			return;
		}
		MultiKey key = new MultiKey();
		Employee e = new Employee();
		switch (i) {
		case 1://employee name record
			e.setEmpName(array[1]);
			e.setFlag(1);
			break;
		default://department record
			e.setDepartName(array[1]);
			e.setFlag(2);
			break;
		}
		e.setDepartId(array[2]);
		e.setDepartNo(array[3]);
		key.setDepartId(e.getDepartId());
		key.setDepartNo(e.getDepartNo());
		context.write(key, e);
	}
}
 

 

 

reducer:

 

 

/**
 * Reduce-side join: buffers the employee-side and department-side records
 * that arrive under the same composite key, then emits their cross product
 * (many-to-many join).
 */
public class MyJoinReducer extends Reducer<MultiKey, Employee, IntWritable, Text>{

	/** Output key is a constant; reuse one instance instead of allocating per record. */
	private static final IntWritable OUTPUT_KEY = new IntWritable(1);

	private final List<Emp> empList = new LinkedList<Emp>();
	private final List<Depart> departList = new LinkedList<Depart>();

	@Override
	public void reduce(MultiKey key,Iterable<Employee> values,Context context) throws IOException,InterruptedException{
		// Clear up front (rather than reallocating at the end) so stale state
		// never leaks into the next group, even if a previous call failed.
		empList.clear();
		departList.clear();
		for (Employee employee : values) {
			visit(employee);
		}
		// Cross product: every employee record joined with every department
		// record sharing this composite key.
		for (Emp em : empList) {
			for (Depart de : departList) {
				Employee e = new Employee();
				e.setDepartId(em.departId);
				e.setDepartName(de.departName);
				e.setDepartNo(em.departNo);
				e.setEmpName(em.empName);
				context.write(OUTPUT_KEY, new Text(e.toString()));
			}
		}
	}

	/** Routes one tagged record into the buffer for its side of the join. */
	private void visit(Employee e){
		switch (e.getFlag()) {
		case 1:
			Emp em = new Emp();
			em.departId = e.getDepartId();
			// BUG FIX: the original copied getDepartName() into departNo,
			// so joined output carried the wrong department number.
			em.departNo = e.getDepartNo();
			em.empName = e.getEmpName();
			empList.add(em);
			break;

		default:
			Depart de = new Depart();
			de.departName = e.getDepartName();
			departList.add(de);
			break;
		}
	}

	/** Lightweight holder for the employee side of the join (static: no hidden outer ref). */
	private static class Emp{
		public String empName;
		public String departId;
		public String departNo;
	}

	/** Lightweight holder for the department side of the join. */
	private static class Depart{
		public String departName;
	}

}

 

comparator

 

/**
 * Sort comparator for MultiKey. Passing {@code true} to the super constructor
 * makes WritableComparator instantiate MultiKey objects so the default
 * byte-level compare deserializes both keys and delegates to
 * MultiKey.compareTo (departId, then departNo).
 */
public class MyJoinComparator extends WritableComparator{

	protected MyJoinComparator() {
		super(MultiKey.class,true);
	}
	
}
 

 

groupcomparator:

 

 

/**
 * Grouping comparator: decides which consecutive sorted keys are fed to the
 * same reduce() call. As written it compares the full composite key, so
 * grouping matches the sort order exactly.
 */
public class MyJoinGroupComparator implements RawComparator<MultiKey> {

	// PERF FIX: the original allocated a new MyJoinComparator on every
	// byte-level compare call; build the delegate once instead.
	// NOTE(review): assumes the framework invokes this comparator from a
	// single thread per task, which is how MapReduce uses grouping
	// comparators — confirm before sharing across threads.
	private static final MyJoinComparator DELEGATE = new MyJoinComparator();

	/** Object-level compare: delegate to MultiKey's natural ordering. */
	@Override
	public int compare(MultiKey key1, MultiKey key2) {
		return key1.compareTo(key2);
	}

	/** Byte-level compare: deserialize-and-compare via WritableComparator. */
	@Override
	public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
		return DELEGATE.compare(b1, s1, l1, b2, s2, l2);
	}

}
 

 

今天iteye的编辑器好坑爹啊,不断的崩溃

 

 

补个测试类

 

/**
 * Driver: uploads the two input files to HDFS, clears the old output
 * directory, and runs the join job.
 */
public class MyJoinTest {

	private static final String INPUT_DIR = "/user/dirk3/input";
	private static final String OUTPUT_DIR = "/user/dirk3/output";

	public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
		upload("dirk1.txt", "dirk.txt");
		upload("dirk2.txt","dirk2.txt");
		delete();
		Configuration conf = new Configuration();
		Job job = new Job(conf, "joinJob");
		// FIX: without setJarByClass the job jar is never shipped, so the job
		// fails with ClassNotFoundException on a real (non-local) cluster.
		job.setJarByClass(MyJoinTest.class);
		job.setMapperClass(MyJoinMapper.class);
		job.setReducerClass(MyJoinReducer.class);
		job.setMapOutputKeyClass(MultiKey.class);
		job.setMapOutputValueClass(Employee.class);
		job.setOutputKeyClass(IntWritable.class);
		job.setOutputValueClass(Text.class);
		job.setGroupingComparatorClass(MyJoinGroupComparator.class);
		FileInputFormat.addInputPath(job, new Path(INPUT_DIR));
		FileOutputFormat.setOutputPath(job, new Path(OUTPUT_DIR));
		// FIX: propagate job success/failure as the process exit code instead
		// of discarding the result.
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

	/**
	 * Copies a file that sits next to this class on the classpath into the
	 * HDFS input directory, overwriting any existing copy.
	 * NOTE(review): getResource("").getPath() only works when the class is
	 * loaded from an exploded directory, not from inside a jar — confirm.
	 */
	public static void upload(String local,String remote) throws IOException{
		Configuration conf = new Configuration();
		FileSystem fs = FileSystem.get(conf);
		String l = MyJoinTest.class.getResource("").getPath()+"/"+local;
		fs.copyFromLocalFile(false, true, new Path(l), new Path(INPUT_DIR+"/"+remote));
	}

	/** Removes the output directory so the job does not fail on an existing path. */
	public static void delete() throws IOException {
		Configuration conf = new Configuration();
		FileSystem fs = FileSystem.get(conf);
		fs.delete(new Path(OUTPUT_DIR), true);
	}

	/**
	 * Unused example of the old (mapred) API equivalents of the sort and
	 * grouping comparator settings; kept for reference only.
	 */
	public static void run(){
		JobConf jobConf = new JobConf();
		jobConf.setOutputKeyComparatorClass(MyJoinComparator.class);
		jobConf.setOutputValueGroupingComparator(MyJoinComparator.class);
	}

}
 

join的主要实现在reducer中

关于comparator:mapper向context中写入key/value后,经过combine、partition和排序,进入reducer阶段时由groupingComparator决定哪些相邻的key对应的记录被归入同一次reduce调用

 

hadoop join实现

  • 0

    开心

    开心

  • 0

    板砖

    板砖

  • 0

    感动

    感动

  • 0

    有用

    有用

  • 0

    疑问

    疑问

  • 0

    难过

    难过

  • 0

    无聊

    无聊

  • 0

    震惊

    震惊

编辑推荐
概念: Hadoop有一个叫DataJoin的包为Data Join提供相应的框架。它的Jar包存在于contrib/datajoin/h
Apache Spark探秘:实现Map-side Join和Reduce-side Join Category: Frameworks (Spark) On YARN Vi
上篇,散仙介绍了基于Hadoop的旧版API结合DataJoin工具类和MapReduce实现的侧连接,那么本次,散仙就
上篇,散仙介绍了基于Hadoop的旧版API结合DataJoin工具类和MapReduce实现的侧连接,那么本次,散仙就
上篇,散仙介绍了基于Hadoop的旧版API结合DataJoin工具类和MapReduce实现的侧连接,那么本次,散仙就
上篇,散仙介绍了基于Hadoop的旧版API结合DataJoin工具类和MapReduce实现的侧连接,那么本次,散仙就
上篇,散仙介绍了基于Hadoop的旧版API结合DataJoin工具类和MapReduce实现的侧连接,那么本次,散仙就
上篇,散仙介绍了基于Hadoop的旧版API结合DataJoin工具类和MapReduce实现的侧连接,那么本次,散仙就
上篇,散仙介绍了基于Hadoop的旧版API结合DataJoin工具类和MapReduce实现的侧连接,那么本次,散仙就
Reduce-side joining / repartitioned sort-merge join Note:DataJoinReducerBase, on the other ha
版权所有 IT知识库 CopyRight © 2009-2015 IT知识库 IT610.com , All Rights Reserved. 京ICP备09083238号