twitterから簡単にfollowing、followersの一覧を得る方法として、Rがあるようです。
iGraphでTwitterのネットワークを表してみる
 igraphでTwitterのネットワークを描いてみる２
しかし、100人までしか取得できないし、僕自身がRに慣れていないので、自分でスクリプトを書いてみました。

#!/usr/bin/php
<?php
require_once 'Services/Twitter.php';

$user='username'; // user name
$pass='password'; // password
$max_loop=10; // safety net in case the program has a bug (upper bound of the paging loop)
$delay=10; // time to sleep after each request (seconds)

/**
 * Services_Twitter subclass that adds cursor-based requests for the
 * followers and friends lists of an arbitrary user.
 */
class Services_Twitter2 extends Services_Twitter {
  // sprintf() templates: %s receives the (URL-encoded) user name and
  // %.0f receives the cursor value printed without a fractional part.
  public $followers_url=
           'http://twitter.com/statuses/followers/%s.json?cursor=%.0f';
  public $friends_url=
           'http://twitter.com/statuses/friends/%s.json?cursor=%.0f';

  /**
   * Forwards the credentials to the Services_Twitter constructor.
   *
   * @param string $user account name
   * @param string $pass account password
   */
  function __construct($user,$pass) {
    parent::__construct($user,$pass);
  }

  /**
   * Requests one page of the followers list of $user.
   *
   * @param string    $user   user whose followers are requested
   * @param int|float $cursor page cursor; -1 is the default first request
   * @return mixed whatever the parent's _getData() returns
   *               (the caller in this script feeds it to json_decode())
   */
  function getAllFollowersFrom($user,$cursor=-1) {
    // The user name comes from outside the program, so it is encoded
    // before being placed in the URL path.
    return $this->_getData('GET',
                           sprintf($this->followers_url,
                                   rawurlencode($user),$cursor),
                           'getAllFollowersFrom');
  }

  /**
   * Requests one page of the friends (following) list of $user.
   *
   * @param string    $user   user whose friends are requested
   * @param int|float $cursor page cursor; -1 is the default first request
   * @return mixed whatever the parent's _getData() returns
   */
  function getAllFriendsFrom($user,$cursor=-1) {
    return $this->_getData('GET',
                           sprintf($this->friends_url,
                                   rawurlencode($user),$cursor),
                           'getAllFriendsFrom');
  }
}

/**
 * Writes the command-line synopsis to standard error and terminates
 * the script with exit status 1.
 */
function usage() {
  fwrite(STDERR,"Usage: getwitter [-fo | -fr]\n");
  exit(1);
}

// Map each accepted option to the Services_Twitter2 method it selects.
// Exactly one argument is required, and it must be one of these options;
// anything else prints the usage text and exits.
$inquire_methods=array(
  '-fo'=>'getAllFollowersFrom',
  '-fr'=>'getAllFriendsFrom',
);
if ($argc!==2 || !isset($inquire_methods[$argv[1]])) {
  usage();
}
$inquire=$inquire_methods[$argv[1]];

// Plain assignment: "=& new" is deprecated since PHP 5.3 and is a parse
// error from PHP 7 on.
$st=new Services_Twitter2($user,$pass);

// Client identification; any value will do
$st->setAppName('getwitter');
$st->setAppVersion('0.0.1');

// One target user name per line on standard input. For every target the
// list is fetched page by page (at most $max_loop pages) and written to
// standard error as "target<TAB>screen_name" lines.
while (($target=fgets(STDIN))!==FALSE) {
  $target=rtrim($target);
  if ($target==='') {
    continue; // skip blank lines instead of spending a request on them
  }
  $results=array();
  $cursor=-1;
  for ($i=0;$i<$max_loop;$i++) {
    $people=json_decode($st->$inquire($target,$cursor));
    // Treat an undecodable response, or one without a "users" array,
    // as an empty page.
    $users=(is_object($people) && isset($people->{'users'})
            && is_array($people->{'users'}))
             ? $people->{'users'} : array();
    foreach ($users as $person) {
      if (!isset($person->{'screen_name'})) {
        continue;
      }
      /*
       * When more than 100 people are fetched, duplicates occasionally
       * show up for some reason, so each name is printed only once.
       */
      if (!isset($results[$person->{'screen_name'}])) {
        // Fixed format string: the data must never be used as the format.
        fprintf(STDERR,"%s\t%s\n",$target,$person->{'screen_name'});
        $results[$person->{'screen_name'}]=1;
      }
    }
    sleep($delay); // pause after every request
    // empty() also covers a missing cursor or a zero that was decoded
    // as a float or string rather than as the integer 0.
    if (count($users)===0 || empty($people->{'next_cursor'})) {
      break; // no further page
    }
    $cursor=$people->{'next_cursor'};
  }
}
?>

Services_Twitterを使っているので、事前にインストールしておいてください。

このスクリプトは標準入力から一覧を得たいユーザ名を入力し、「標準エラー出力」に結果を出力します。標準出力にはよくわからないメッセージがServices_Twitterから出るので出力を分けました。
followingの一覧を出力したい場合は「-fr」オプションを付け、followersの一覧を出力したい場合は「-fo」オプションを付けます。
結果の形式は次の通りです。

ユーザ名1\t人1
ユーザ名1\t人2
ユーザ名1\t人3
.
.
.

人1、2、3というのはfollowing、またはfollowers一覧に名前がある人のことです。
なお、動かす前に$user、$pass、$max_loop、$delayを設定しておいてください。

使用例を挙げておきます。
入力用のファイル
標準入力からユーザ名を入力できるので、ファイルに書いておくと便利です。

ユーザ名1
ユーザ名2
ユーザ名3
.
.
.

followingの一覧を取得
bashの場合

getwitter.php -fr < 入力ファイル 1> /dev/null 2> 出力ファイル

followersの一覧を取得
bashの場合

getwitter.php -fo < 入力ファイル 1> /dev/null 2> 出力ファイル

スクリプト自体は単純だと思うので、是非用途に応じて改造してください!

2009-12-13

すかすかの配列から効率良く全要素を取り出す

アルゴリズムプログラミング C言語

一番右端に立っているビットの位置を求めるアルゴリズムが載っていました。

http://d.hatena.ne.jp/siokoshou/20090704#p1
http://chessprogramming.wikispaces.com/BitScan#DeBruijnMultiplation

まさしく黒魔術!良くこんなの思いつくなぁ、と感心する事しきりなのですが、ふと、これを使えばすかすかの配列から効率良く全要素を取り出す事ができるのではないかと思い、やってみました。

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

typedef unsigned long long U64;

/*
 * Helpers for measuring execution time.
 * start_time holds the moment the timer was last (re)started and
 * current_time the moment of the most recent print_time() call,
 * both as seconds with a fractional microsecond part.
 */
double current_time,start_time;

/* Reads the time of day and returns it as seconds (with microseconds). */
static double now_in_seconds(void) {
  struct timeval now;

  gettimeofday(&now,NULL);
  return now.tv_sec+now.tv_usec*1e-6;
}

/*
 * Writes "<mess>: <elapsed seconds>" to stderr, where the elapsed time is
 * measured from the last set_timer()/print_time() call, then restarts the
 * timer from the current moment.
 */
void print_time(const char *mess) {
  current_time=now_in_seconds();
  fprintf(stderr,"%s: %-.5f\n",mess,current_time-start_time);
  start_time=current_time;
}

/* Starts (or restarts) the timer at the current moment. */
void set_timer() {
  start_time=now_in_seconds();
}

/*
 * "Black magic": finding the lowest set bit with a de Bruijn multiplication.
 *
 * index64 maps the 6-bit value produced in bitScanForward() to the
 * position (0..63) of the bit that generated it.
 */
const int index64[64] = {
   63,  0, 58,  1, 59, 47, 53,  2,
   60, 39, 48, 27, 54, 33, 42,  3,
   61, 51, 37, 40, 49, 18, 28, 20,
   55, 30, 34, 11, 43, 14, 22,  4,
   62, 57, 46, 52, 38, 26, 32, 41,
   50, 36, 17, 19, 29, 10, 13, 21,
   56, 45, 25, 31, 35, 16,  9, 12,
   44, 24, 15,  8, 23,  7,  6,  5
};

/*
 * Returns the position (0 = least significant) of the lowest set bit of bb.
 * For bb == 0 the function returns 0, which cannot be told apart from
 * "bit 0 is set", so callers have to rule out zero themselves.
 */
int bitScanForward(U64 bb) {
  const U64 debruijn64=0x07EDD5E59A4E28C2ULL;
  U64 lowest;

  if (bb!=0) {
    /* two's-complement negation spelled out: keeps only the lowest 1 bit */
    lowest=bb&(~bb+1);
    /* the top 6 bits of the product select the table entry */
    return index64[(lowest*debruijn64)>>58];
  }
  return 0;
}

/*
 * Reads up to argv[1] integers from standard input, marks each of them in a
 * plain int array and in a bitmap (one U64 word per 64 indices), then prints
 * every marked index in ascending order by visiting only the set bits of the
 * bitmap. The time spent on input and on output is reported on stderr.
 *
 * Returns 0 on success, 1 on a usage error or allocation failure.
 */
int main(int argc,char *argv[]) {
  const unsigned int length_of_array=51200000;
  const unsigned int length_of_indexes=length_of_array/64;
  U64 *indexes;
  int *array;
  char *end;
  long count,n;
  unsigned int i;

  if (argc<2) {
    fprintf(stderr,"Usage: %s number_of_inputs\n",argc>0?argv[0]:"bitscan");
    return 1;
  }
  /* Parse the count once instead of calling atoi() on every iteration. */
  count=strtol(argv[1],&end,10);
  if (end==argv[1] || *end!='\0' || count<0) {
    fprintf(stderr,"invalid number of inputs: %s\n",argv[1]);
    return 1;
  }

  indexes=calloc(length_of_indexes,sizeof *indexes);
  array=calloc(length_of_array,sizeof *array);
  if (indexes==NULL || array==NULL) {
    fprintf(stderr,"out of memory\n");
    free(indexes);
    free(array);
    return 1;
  }

  set_timer();
  for (n=0;n<count;n++) {
    int j;

    if (scanf("%d",&j)!=1) {
      break; /* end of input or malformed number: stop reading */
    }
    if (j<0 || (unsigned int)j>=length_of_array) {
      fprintf(stderr,"ignored out-of-range value: %d\n",j);
      continue;
    }
    /* array is only written here; the output below is driven by indexes */
    array[j]=1;
    /*
     * Each element of indexes manages 64 subscripts:
     * j==32 sets the 33rd bit of indexes[0],
     * j==72 sets the 9th bit of indexes[1], and so on.
     */
    indexes[j/64]|=1ULL<<(j%64);
  }
  print_time("input"); /* time spent on input */
  for (i=0;i<length_of_indexes;i++) {
    U64 bits=indexes[i];

    while (bits!=0) {
      printf("%u\n",(unsigned int)bitScanForward(bits)+64*i);
      bits&=bits-1; /* clear the lowest set bit and look for the next one */
    }
  }
  print_time("print"); /* time spent on output */
  free(array);
  free(indexes);
  return 0;
}

argvの扱いや入力部分がいい加減ですが御勘弁を。
次のプログラムは比較用のプログラムです。単純に配列を先頭から調べているだけです。

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

/*
 * Execution-time measurement.
 * Both globals hold a time of day in seconds with a fractional
 * microsecond part.
 */
double current_time,start_time;

/*
 * Reports on stderr how many seconds have passed since the timer was last
 * set or printed, labelled with mess, and restarts the timer.
 */
void print_time(const char *mess) {
  struct timeval now;
  double elapsed;

  gettimeofday(&now,NULL);
  current_time=(double)now.tv_sec+1e-6*(double)now.tv_usec;
  elapsed=current_time-start_time;
  start_time=current_time;
  fprintf(stderr,"%s: %-.5f\n",mess,elapsed);
}

/* Records the current moment as the starting point of the timer. */
void set_timer() {
  struct timeval now;

  gettimeofday(&now,NULL);
  start_time=(double)now.tv_sec+1e-6*(double)now.tv_usec;
}

/*
 * Comparison program: reads up to argv[1] integers from standard input,
 * marks each of them in an int array, then prints every marked index by
 * checking the array from the first element to the last. The time spent on
 * input and on output is reported on stderr.
 *
 * Returns 0 on success, 1 on a usage error or allocation failure.
 */
int main(int argc,char *argv[]) {
  const unsigned int length_of_array=51200000;
  int *array;
  char *end;
  long count,n;
  unsigned int i;

  if (argc<2) {
    fprintf(stderr,"Usage: %s number_of_inputs\n",argc>0?argv[0]:"scan");
    return 1;
  }
  /* Parse the count once instead of calling atoi() on every iteration. */
  count=strtol(argv[1],&end,10);
  if (end==argv[1] || *end!='\0' || count<0) {
    fprintf(stderr,"invalid number of inputs: %s\n",argv[1]);
    return 1;
  }

  array=calloc(length_of_array,sizeof *array);
  if (array==NULL) {
    fprintf(stderr,"out of memory\n");
    return 1;
  }

  set_timer();
  for (n=0;n<count;n++) {
    int j;

    if (scanf("%d",&j)!=1) {
      break; /* end of input or malformed number: stop reading */
    }
    if (j<0 || (unsigned int)j>=length_of_array) {
      fprintf(stderr,"ignored out-of-range value: %d\n",j);
      continue;
    }
    array[j]=1;
  }
  print_time("input"); /* time spent on input */
  for (i=0;i<length_of_array;i++) {
    if (array[i]!=0) {
      printf("%u\n",i);
    }
  }
  print_time("print"); /* time spent on output */
  free(array);
  return 0;
}

ユーザ空間での実行時間

入力要素数	黒魔術	単純探索
10000	0.037s	0.166s
100000	0.201s	0.268s
1000000	1.128s	1.044s
10000000	10.125s	8.737s

入力にかかった時間

入力要素数	黒魔術	単純探索
10000	0.088s	0.073s
100000	0.411s	0.381s
1000000	1.103s	0.942s
10000000	7.546s	5.981s

出力にかかった時間

入力要素数	黒魔術	単純探索
10000	0.007s	0.321s
100000	0.037s	0.152s(減ってる???バグ???)
1000000	0.323s	0.393s
10000000	3.064s	3.210s

要素が多くなると全体でかかった時間は黒魔術の方が長くなります。しかし、出力にかかった時間はどれも黒魔術の方が短いので、要素が多くなるとindexesに格納する所が足を引っ張ることがわかります。
使えば速くなる、って訳でもないですね・・・。

2009-12-06

自動拡張する文字列用バッファ

プログラミング C言語 C言語の便利な関数

GString

#include <glib.h>
typedef struct {
  gchar  *str;
  gsize len;
  gsize allocated_len;
} GString;
GString* g_string_new(const gchar *init);
GString* g_string_append(GString *string, const gchar *val);
GString* g_string_prepend(GString *string, const gchar *val);
GString* g_string_insert(GString *string, gssize pos, const gchar *val);
GString* g_string_erase(GString *string, gssize pos, gssize len);
gchar* g_string_free(GString *string, gboolean free_segment);

GStringは、GLibに含まれている文字列用バッファです。文字列の追加、挿入、削除が容易に行えます。

gchar* g_string_free(GString *string, gboolean free_segment);

stringを解放します。free_segmentをTRUEにすると、string->strも解放されます。
なお、GString->strは文字列を追加した際にアドレスが変わる可能性があるので注意しましょう。

使い方

#include <glib.h>
#include <stdio.h>

/*
 * Builds a string step by step in a GString and prints the result.
 */
int main() {
  GString *buf=g_string_new("def");

  g_string_append(buf,"ghi");   /* add to the end */
  g_string_prepend(buf,"abc");  /* add to the front */
  g_string_insert(buf,3,"__");  /* insert at position 3 */
  puts(buf->str);
  g_string_free(buf,TRUE);      /* TRUE: release buf->str as well */
  return 0;
}

コンパイル

プログラムが書かれたファイル名をtest.cとします。

gcc test.c `pkg-config --cflags glib-2.0` `pkg-config --libs glib-2.0`

実行結果

abc__defghi

stralloc

#include <djbdns/stralloc.h>
typedef struct stralloc {
  char *s;
  unsigned int len;
  unsigned int a;
} stralloc;
int stralloc_copy(stralloc *a,const stralloc *b);
int stralloc_copys(stralloc *a,const char *b);
int stralloc_cat(stralloc *a,const stralloc *b);
int stralloc_cats(stralloc *a,const char *b);
int stralloc_copyb(stralloc *a,const char *b,unsigned int n);
int stralloc_catb(stralloc *a,const char *b,unsigned int n);
int stralloc_append(stralloc *a,const char *b);

#include <djbdns/alloc.h>
void alloc_free(char *x);

strallocはdjbdnsに含まれている構造体です。Debianの場合、libdjbdns1-devというパッケージになっています。

int stralloc_copy(stralloc *a,const stralloc *b);
int stralloc_copys(stralloc *a,const char *b);

bの文字列(ヌル文字は含みません)でaを上書きします。aの長さはbと同じになります。aの文字列にはヌル文字が追加されないことに注意してください。
成功すると1を返し、失敗すると0を返します。

int stralloc_cat(stralloc *a,const stralloc *b);
int stralloc_cats(stralloc *a,const char *b);

bの文字列(ヌル文字は含みません)をaに追加します。この関数は単純に追加するだけなので、aがヌル文字で終わっている場合はヌル文字の後ろからbが追加されます。また、追加後、aの文字列にはヌル文字が追加されないことに注意してください。
成功すると1を返し、失敗すると0を返します。

int stralloc_copyb(stralloc *a,const char *b,unsigned int n);

n文字分のbの文字列でaを上書きします。nがbの長さを超えないように気をつけましょう。
aの長さはnになります。aの文字列にはヌル文字が追加されないことに注意してください。
なお、nにヌル文字も含めた長さを渡すと、aにヌル文字が付加されます。
成功すると1を返し、失敗すると0を返します。

int stralloc_catb(stralloc *a,const char *b,unsigned int n);

n文字分のbの文字列をaに追加します。nがbの長さを超えないように気をつけましょう。
この関数は単純に追加するだけなので、aがヌル文字で終わっている場合はヌル文字の後ろからbが追加されます。また、追加後、aの文字列にはヌル文字が追加されないことに注意してください。
なお、nにヌル文字も含めた長さを渡すと、aにヌル文字が付加されます。
成功すると1を返し、失敗すると0を返します。

int stralloc_append(stralloc *a,const char *b);

bの先頭の一文字をaに追加します。aがヌル文字で終わっている場合はヌル文字の後ろからbが追加されます。また、追加後、aの文字列にはヌル文字が追加されないことに注意してください。
成功すると1を返し、失敗すると0を返します。

void alloc_free(char *x);

strallocは、djbdnsに含まれているalloc()でメモリを確保します。alloc()は静的に確保された領域を持っており、最初はその領域を返します。そして、静的に確保された領域を使い切るとmalloc()を使って動的に領域を確保します。
alloc_free()は、渡されたポインタが静的に確保された領域なのか、動的に確保された領域なのかを判別し、動的に確保された領域の場合はfree()を呼んで解放します。

使い方

#include <djbdns/alloc.h>
#include <djbdns/stralloc.h>
#include <stdio.h>

/*
 * Demonstrates stralloc: builds two strings in the same buffer and prints
 * them. Every stralloc_* call returns 0 on failure, so each result is
 * checked before the buffer is used.
 *
 * Returns 0 on success, 1 if a stralloc operation failed.
 */
int main() {
  stralloc a={0};
  int status=1;

  if (!stralloc_copys(&a,"abcdef")) {
    goto done;
  }
  if (!stralloc_append(&a,"")) { /* append a null character */
    goto done;
  }
  printf("%s\n",a.s);

  if (!stralloc_copys(&a,"ghi")) {
    goto done;
  }
  if (!stralloc_catb(&a,"jklmn",6)) { /* append including the null character */
    goto done;
  }
  printf("%s\n",a.s);
  status=0;

done:
  if (status!=0) {
    fprintf(stderr,"stralloc operation failed\n");
  }
  if (a.s!=NULL) {
    alloc_free(a.s);
  }
  return status;
}

コンパイル

プログラムが書かれたファイル名をtest.cとします。

gcc test.c -ldjbdns

実行結果

abcdef
ghijklmn